### Import Required Packages and Set Options

In [1]:
import os
import sys

import numpy as np
import pandas as pd
import multiprocessing as mp

import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime
from itertools import combinations
from functools import partial
from numpy.linalg import norm

import jax.numpy as jnp
from jax import grad, jit, vmap

In [2]:
REPO_ROOT = "/Users/ericlundquist/Repos/rankfm"
DATA_ROOT = os.path.join(REPO_ROOT, "data/ml-100k")

### Load/Prepare Example Data

#### Prepare Users Data

In [3]:
users_df = pd.read_csv(os.path.join(DATA_ROOT, "users.csv"))
users_df['agegroup'] = pd.cut(users_df['age'], [0, 30, 45, 100], right=False, labels=False)
users_df = users_df.drop(['age', 'zip_code'], axis=1)
users_df = pd.get_dummies(users_df, prefix_sep='__', columns=['agegroup', 'gender', 'occupation'])
users_df.mean()

user_id                      472.000000
agegroup__0                    0.433722
agegroup__1                    0.348887
agegroup__2                    0.217391
gender__F                      0.289502
gender__M                      0.710498
occupation__administrator      0.083775
occupation__artist             0.029692
occupation__doctor             0.007423
occupation__educator           0.100742
occupation__engineer           0.071050
occupation__entertainment      0.019088
occupation__executive          0.033934
occupation__healthcare         0.016967
occupation__homemaker          0.007423
occupation__lawyer             0.012725
occupation__librarian          0.054083
occupation__marketing          0.027572
occupation__none               0.009544
occupation__other              0.111347
occupation__programmer         0.069989
occupation__retired            0.014846
occupation__salesman           0.012725
occupation__scientist          0.032874
occupation__student            0.207847


#### Prepare Items Data

In [4]:
items_df = pd.read_csv(os.path.join(DATA_ROOT, "items.csv"))
item_names = items_df[['item_id', 'item_name']]
item_names.head()

Unnamed: 0,item_id,item_name
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [5]:
items_df = items_df.drop(['item_name', 'release_date'], axis=1)
items_df.columns = ['item_id'] + ["genre__{}".format(col) for col in items_df.columns[1:]]
items_df.mean()

item_id               841.500000
genre__action           0.149227
genre__adventure        0.080262
genre__animation        0.024970
genre__childrens        0.072533
genre__comedy           0.300238
genre__crime            0.064804
genre__documentary      0.029727
genre__drama            0.431034
genre__fantasy          0.013080
genre__film_noir        0.014269
genre__horror           0.054697
genre__musical          0.033294
genre__mystery          0.036266
genre__romance          0.146849
genre__scifi            0.060048
genre__thriller         0.149227
genre__war              0.042212
genre__western          0.016052
dtype: float64

#### Prepare Ratings Data

In [6]:
ratings_explicit = pd.read_csv(os.path.join(DATA_ROOT, "ratings.csv"))
ratings_explicit['timestamp'] = pd.to_datetime(ratings_explicit['unix_timestamp'], origin='unix', unit='s')
ratings_explicit['positive_feedback'] = ratings_explicit.groupby('user_id')['rating'].transform(lambda c: np.where(c > c.mean(), 1, 0))
ratings_explicit = ratings_explicit.drop('unix_timestamp', axis=1)
ratings_explicit.mean()

user_id              462.48475
item_id              425.53013
rating                 3.52986
positive_feedback      0.54194
dtype: float64

In [7]:
ratings_explicit.head()

Unnamed: 0,user_id,item_id,rating,timestamp,positive_feedback
0,196,242,3,1997-12-04 15:55:49,0
1,186,302,3,1998-04-04 19:22:22,0
2,22,377,1,1997-11-07 07:18:36,0
3,244,51,2,1997-11-27 05:02:03,0
4,166,346,1,1998-02-02 05:33:16,0


#### Generate Implicit Feedback Version

In [8]:
ratings_implicit = ratings_explicit[ratings_explicit.positive_feedback == 1].reset_index(drop=True)
ratings_implicit.head()

Unnamed: 0,user_id,item_id,rating,timestamp,positive_feedback
0,253,465,5,1998-04-03 18:34:27,1
1,286,1014,5,1997-11-17 15:38:45,1
2,200,222,5,1997-10-05 09:05:40,1
3,224,29,3,1998-02-21 23:40:57,1
4,122,387,5,1997-11-11 17:47:39,1


#### Print Final Matrix Shapes
* NOTE: consider some way to include these movies that literally nobody rated above average in the sampling procedure

In [9]:
print(ratings_explicit.user_id.nunique(), ratings_explicit.item_id.nunique())
print(ratings_implicit.user_id.nunique(), ratings_implicit.item_id.nunique())

print(ratings_explicit.shape)
print(ratings_implicit.shape)

943 1682
943 1483
(100000, 5)
(54194, 5)


### Define Helper/Utility Functions

In [10]:
def sample_tuple(u, i, items, user_items):
    """sample a single (u, i, j) tuple
    
    :param u: user_id
    :param i: item_id
    :param items: set of all items 
    :param user_items: dictionary where keys are user_id and values are sets of interacted items
    :return: (u, i, j) tuple where item (j) is randomly sampled from non-interacted items
    """
    
    return u, i, np.random.choice(list(items - user_items[u]))  


def sample_tuples(interactions, items, user_items, num_threads=mp.cpu_count()):
    """sample a (u, i, j) tuple for every observed interaction
    
    :param interactions: ndarray of observed user/item interactions  
    :param items: set of all items 
    :param user_items: dictionary where keys are user_id and values are sets of interacted items
    :param num_threads: number of parallel computation threads to use
    :return: list of sampled (u, i, j) tuples with one tuple for each observed (u, i) interaction   
    """
    
    pool = mp.Pool(processes=num_threads)
    mp_sample_tuple = partial(sample_tuple, items=items, user_items=user_items)
    samples = pool.starmap(mp_sample_tuple, zip(interactions.iloc[:, 0], interactions.iloc[:, 1]))
    pool.close()
    pool.join()
    return samples

### Define Evaluation Metric Functions

In [11]:
def precision_at_k(model, test_interactions, k=10):
    """evaluate precision wrt out-of-sample observed interactions
    
    :param model: trained RankFM model instance
    :param test_interactions: pandas dataframe of out-of-sample observed user/item interactions  
    :param k: number of recommendations to generate for each user
    :return: overall average precision (scalar)
    """
    
    test_users = np.unique(test_interactions.iloc[:, 0])
    train_users = model.user_idx.map(model.index_to_user)
    common_users = list(set(test_users) & set(train_users))
    
    test_user_items = pd.DataFrame(test_interactions.copy(), columns=['user_id', 'item_id'])
    common_user_items = test_user_items[test_user_items.user_id.isin(common_users)].groupby('user_id')['item_id'].apply(set).to_dict()
    
    recommendations = model.recommend_for_users(common_users)
    precision = np.mean([len(set(recommendations.loc[key]) & set(val)) / len(set(recommendations.loc[key])) for key, val in common_user_items.items()])
    return precision


def recall_at_k(model, test_interactions, k=10):
    """evaluate recall wrt out-of-sample observed interactions
    
    :param model: trained RankFM model instance
    :param test_interactions: pandas dataframe of out-of-sample observed user/item interactions  
    :param k: number of recommendations to generate for each user
    :return: overall average recall (scalar)
    """
    
    test_users = np.unique(test_interactions.iloc[:, 0])
    train_users = model.user_idx.map(model.index_to_user)
    common_users = list(set(test_users) & set(train_users))
    
    test_user_items = pd.DataFrame(test_interactions.copy(), columns=['user_id', 'item_id'])
    common_user_items = test_user_items[test_user_items.user_id.isin(common_users)].groupby('user_id')['item_id'].apply(set).to_dict()
    
    recommendations = model.recommend_for_users(common_users)
    recall = np.mean([len(set(recommendations.loc[key]) & set(val)) / len(set(val)) for key, val in common_user_items.items()])
    return recall

### Define Main Model Class

In [12]:
class RankFM():
    """Factorization Machines for Ranking Problems with Implicit Feedback Data"""
    
    def __init__(self, factors=10, learning_rate=0.1, regularization=0.01, sigma=0.1):
        """store hyperparameters and initialize internal data
        
        :param factors: latent factor rank
        :param learning_rate: learning rate for gradient step weight updates
        :param regularization: L2 regularization penalty on model weights
        :param sigma: standard deviation to use for random initialization of factor weights
        :return: None
        """
        
        # model hyperparameters
        self.factors = factors
        self.learning_rate = learning_rate
        self.regularization = regularization
        self.sigma = sigma
        
        # user/item identifiers <--> index positions mappings
        self.index_to_user = None
        self.index_to_item = None
        self.user_to_index = None
        self.item_to_index = None
        
        # pandas raw interactions and unique sets of users, items, user_items
        self.interactions = None
        self.user_idx = None
        self.item_idx = None
        self.user_items = None
        
        # number of unique users/items
        self.n_users = None
        self.n_items = None

        # model weights
        self.w_item = None
        self.v_user = None
        self.v_item = None
        

    def _initialize(self, interactions):
        """create bi-directional raw user/item identifiers to zero-based index positions mappings and initialize model weights
        
        :param interactions: pandas dataframe of observed user/item interactions
        :return: None
        """
          
        # copy the raw interactions data into a pandas dataframe for internal use
        self.interactions = pd.DataFrame(interactions.copy(), columns=['user_id', 'item_id'])

        # create zero-based index position to identifier mappings
        self.index_to_user = pd.Series(np.sort(np.unique(self.interactions['user_id']))).to_dict()
        self.index_to_item = pd.Series(np.sort(np.unique(self.interactions['item_id']))).to_dict()
        
        # create reverse mappings from identifiers to zero-based index positions
        self.user_to_index = {val: key for key, val in self.index_to_user.items()}
        self.item_to_index = {val: key for key, val in self.index_to_item.items()}
        
        # convert the internal interactions data from identifiers to index positions
        self.interactions['user_id'] = self.interactions['user_id'].map(self.user_to_index)
        self.interactions['item_id'] = self.interactions['item_id'].map(self.item_to_index)
        self.interactions = self.interactions.astype('int32').rename({'user_id': 'user_idx', 'item_id': 'item_idx'}, axis=1)
        
        # store unique values of user/item indexes and observed interactions for each user
        self.user_idx = pd.Series(np.sort(np.unique(self.interactions['user_idx'])))
        self.item_idx = pd.Series(np.sort(np.unique(self.interactions['item_idx'])))
        self.user_items = self.interactions.groupby('user_idx')['item_idx'].apply(set).to_dict()
        
        # store the total cardinality of users/items
        self.n_users = len(self.user_idx)
        self.n_items = len(self.item_idx)
        
        # initialize the model weights as ndarrays
        self.w_item = np.zeros(self.n_items)
        self.v_user = np.random.normal(loc=0, scale=self.sigma, size=(self.n_users, self.factors))
        self.v_item = np.random.normal(loc=0, scale=self.sigma, size=(self.n_items, self.factors))
        
        
    def _pointwise_utility(self, u, i):
        """calculate the pointwise utility of a given (user, item) pair
        
        :param u: user index
        :param i: item index
        :return: scalar utility
        """
        
        utility = self.w_item[i] + np.dot(self.v_user[u], self.v_item[i])
        return utility
    
    
    def _pointwise_utilities(self, u):
        """calculate the pointwise utilities of all items for a given user
        
        :param u: user index
        :return: vector of utilities for all items by index position
        """
        
        utilities = self.w_item + np.dot(self.v_item, self.v_user[u])
        return utilities


    def _pairwise_utility(self, sample):
        """calculate the pairwise utility of an (u, i, j) sample
        
        :param sample: (u, i, j) index position tuple
        :return: pairwise utility score: utility(u, i) - utility(u, j)
        """

        u, i, j = sample
        utility = self.w_item[i] - self.w_item[j] + np.dot(self.v_user[u], self.v_item[i] - self.v_item[j])
        return utility


    def _gradient_step(self, sample):
        """update current model weights based on the gradient of the log-likelihood function
        
        :param sample: (u, i, j) index position tuple
        :return: None
        """

        # calculate the pairwise utility of the (u, i, j) pair
        u, i, j = sample
        utility = self._pairwise_utility(sample)

        # calculate the outer derivative [d_LL/d_g(theta)] and the regularization derivative [d_pen/d_theta]
        d_con = (1 / (1 + np.exp(utility)))
        d_pen = 2 * self.regularization

        # calculate the inner derivatives [d_g(theta)/d_theta]
        d_w_i = 1
        d_w_j = -1
        d_v_u = self.v_item[i] - self.v_item[j]
        d_v_i = self.v_user[u]
        d_v_j = -self.v_user[u]

        # perform the gradient updates to the relevant model weights
        self.w_item[i] += self.learning_rate * ((d_con * d_w_i) - (d_pen * self.w_item[i]))
        self.w_item[j] += self.learning_rate * ((d_con * d_w_j) - (d_pen * self.w_item[j]))
        self.v_user[u] += self.learning_rate * ((d_con * d_v_u) - (d_pen * self.v_user[u]))
        self.v_item[i] += self.learning_rate * ((d_con * d_v_i) - (d_pen * self.v_item[i]))
        self.v_item[j] += self.learning_rate * ((d_con * d_v_j) - (d_pen * self.v_item[j]))
        
    
    def _log_likelihood(self, samples):
        """calculate the regularized log-likelihood of all sample pairs
        
        :param samples: list of (u, i, j) index position tuples
        :return: current penalized log-likelihood (scalar)
        """

        likelihood = sum(np.log(1 / (1 + np.exp(-self._pairwise_utility(sample)))) for sample in samples)
        penalty = sum([np.sum(self.regularization * np.square(w)) for w in [self.w_item, self.v_user, self.v_item]])
        return likelihood - penalty
    
    
    def fit(self, interactions, epochs=1, verbose=False):
        """train model weights using the interaction data
        
        :param interactions: pandas dataframe of observed user/item interactions
        :param epochs: number of training epochs (full passes through observed interactions)
        :param verbose: whether to print epoch number and log-likelihood during training
        :return: self
        """
        
        # copy the raw interaction data into an internal pandas dataframe
        # and map original user/item identifers to zero-based index positions
        self._initialize(interactions)
        
        for epoch in range(epochs):
            
            shuffle_index = np.arange(self.interactions.shape[0])
            np.random.shuffle(shuffle_index)
            
            shuffle_samples = sample_tuples(self.interactions.iloc[shuffle_index], set(self.item_idx), self.user_items)
            for sample in shuffle_samples:
                self._gradient_step(sample) 
                
            if verbose:
                ll = self._log_likelihood(shuffle_samples)
                print("\ntraining epoch: {}".format(epoch))
                print("penalized log-likelihood: {}".format(round(ll, 2)))
                
        # return a reference to the trained model object for chained method calls
        return self
                
                
    def predict(self, interactions):
        """calculate the predicted pointwise utilities for all (user, item) pairs
        
        :param interactions: pandas dataframe of user/item pairs for which to generate utility scores
        :return: vector of utility scores corresponding to input user/item pairs
        """
        
        user_idx = interactions.iloc[:, 0].map(self.user_to_index)
        item_idx = interactions.iloc[:, 1].map(self.item_to_index)
        
        scores = np.array([self._pointwise_utility(user, item) for user, item in zip(user_idx, item_idx)])
        return scores
                
            
    def recommend_for_users(self, users, n_items=10):
        """calculate the topN items for each user
        
        :param users: list of user identifiers for which to generate recommendations
        :param n_items: number of recommended items to generate for each user
        :return: pandas dataframe where the index values are user identifiers and the columns are recommended items 
        """

        user_idx = pd.Series(users).map(self.user_to_index).dropna().astype('int32')
        user_ids = user_idx.map(self.index_to_user)
        
        top_n = [pd.Series(self._pointwise_utilities(user)).sort_values(ascending=False)[:n_items].index.values for user in user_idx]
        top_n = pd.DataFrame(top_n, index=user_ids).apply(lambda c: c.map(self.index_to_item))
        return top_n
    
    

### Test Out the Model on the Example Data

#### Create (Training, Validation) Interactions for Evaluation

In [13]:
print(ratings_implicit.shape)
ratings_implicit.head()

(54194, 5)


Unnamed: 0,user_id,item_id,rating,timestamp,positive_feedback
0,253,465,5,1998-04-03 18:34:27,1
1,286,1014,5,1997-11-17 15:38:45,1
2,200,222,5,1997-10-05 09:05:40,1
3,224,29,3,1998-02-21 23:40:57,1
4,122,387,5,1997-11-11 17:47:39,1


In [14]:
test_pct = 0.25

train_mask = (ratings_implicit.index / ratings_implicit.shape[0]) <  (1 - test_pct)
valid_mask = (ratings_implicit.index / ratings_implicit.shape[0]) >= (1 - test_pct)

interactions_train = ratings_implicit[train_mask][['user_id', 'item_id']]
interactions_valid = ratings_implicit[valid_mask][['user_id', 'item_id']]

interactions_train.shape, interactions_valid.shape

((40646, 2), (13548, 2))

#### Initialize and Fit the Model

In [15]:
model = RankFM(factors=10, learning_rate=0.1, regularization=0.01, sigma=0.1)
model

<__main__.RankFM at 0x1c2d7573c8>

In [16]:
%%prun -l 10 

model.fit(interactions_train, epochs=10, verbose=True)


training epoch: 0
penalized log-likelihood: -15892.6

training epoch: 1
penalized log-likelihood: -14856.59

training epoch: 2
penalized log-likelihood: -13830.46

training epoch: 3
penalized log-likelihood: -13029.39

training epoch: 4
penalized log-likelihood: -11909.33

training epoch: 5
penalized log-likelihood: -10774.33

training epoch: 6
penalized log-likelihood: -9774.06

training epoch: 7
penalized log-likelihood: -8912.31

training epoch: 8
penalized log-likelihood: -8529.65

training epoch: 9
penalized log-likelihood: -8145.14
 

         2788045 function calls (2786776 primitive calls) in 27.491 seconds

   Ordered by: internal time
   List reduced from 757 to 10 due to restriction <10>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
     1348   12.861    0.010   12.861    0.010 {method 'acquire' of '_thread.lock' objects}
   406460    9.382    0.000   10.719    0.000 <ipython-input-12-7968853ca3be>:115(_gradient_step)
   812920    1.811    0.000    2.500    0.000 <ipython-input-12-7968853ca3be>:103(_pairwise_utility)
   406470    1.477    0.000    2.639    0.000 <ipython-input-12-7968853ca3be>:152(<genexpr>)
   812920    0.689    0.000    0.689    0.000 {built-in method numpy.dot}
        1    0.306    0.306   27.488   27.488 <ipython-input-12-7968853ca3be>:157(fit)
       10    0.198    0.020    0.201    0.020 pool.py:397(_map_async)
      360    0.140    0.000    0.140    0.000 {built-in method posix.waitpid}
       31    0.094    0.003    2.734    0.088 {built-in method builtins.su

#### Evaluate Model Performance

In [17]:
interactions_valid.head()

Unnamed: 0,user_id,item_id
40646,437,443
40647,712,1037
40648,903,1070
40649,151,629
40650,711,79


In [18]:
k = 10

In [19]:
model_pre = round(precision_at_k(model, interactions_valid, k=k), 3)
model_rec = round(recall_at_k(model, interactions_valid, k=k), 3)
print("model precision: {} model recall: {}".format(model_pre, model_rec))

model precision: 0.118 model recall: 0.123


#### Evaluate Pure Popularity Baseline

In [20]:
most_popular = interactions_train.groupby('item_id').size().sort_values(ascending=False)[:k]
most_popular

item_id
50     359
100    292
181    277
258    261
98     249
174    246
127    246
1      226
56     223
313    222
dtype: int64

In [21]:
test_user_items = interactions_valid.groupby('user_id')['item_id'].apply(set).to_dict()
test_recommends = model.recommend_for_users(list(test_user_items.keys()))
test_user_items = {key: val for key, val in test_user_items.items() if key in test_recommends.index}

base_pre = round(np.mean([len(set(most_popular.index) & set(val)) / len(set(most_popular.index)) for key, val in test_user_items.items()]), 3)
base_rec = round(np.mean([len(set(most_popular.index) & set(val)) / len(set(val))                for key, val in test_user_items.items()]), 3)

print("number of test users: {}".format(len(test_user_items)))
print("baseline precision: {} baseline recall: {}".format(base_pre, base_rec))

number of test users: 905
baseline precision: 0.098 baseline recall: 0.087


#### Spot-Check Some User Recommendations

In [27]:
test_users = list(test_user_items.keys())
recommendations = model.recommend_for_users(test_users, n_items=10)
recommendations.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
1,238,168,176,100,12,70,154,50,174,208
2,258,269,301,286,300,255,272,289,750,328
3,327,301,258,272,302,269,268,288,270,347
4,258,269,288,272,268,327,302,301,315,270
5,168,176,173,403,230,79,174,99,22,11


In [28]:
user_id = 6
user_recs = recommendations.loc[user_id]
user_item_names = item_names[item_names.item_id.isin(user_recs)].set_index('item_id').loc[user_recs]
user_item_names

Unnamed: 0_level_0,item_name
item_id,Unnamed: 1_level_1
134,Citizen Kane (1941)
474,Dr. Strangelove or: How I Learned to Stop Worr...
100,Fargo (1996)
197,"Graduate, The (1967)"
480,North by Northwest (1959)
357,One Flew Over the Cuckoo's Nest (1975)
479,Vertigo (1958)
483,Casablanca (1942)
462,Like Water For Chocolate (Como agua para choco...
603,Rear Window (1954)


### Look at the Movies with the Highest/Lowest Model Weights

In [31]:
item_weights = pd.DataFrame({'item_idx': np.arange(len(model.w_item)), 'w_item': model.w_item})
item_weights['item_id'] = item_weights['item_idx'].map(model.index_to_item)
item_weights = pd.merge(item_weights, item_names, on='item_id').sort_values('w_item', ascending=False)
item_weights = item_weights[['item_id', 'item_idx', 'item_name', 'w_item']]

#### Best Movies Ever?

In [32]:
item_weights.head(10)

Unnamed: 0,item_id,item_idx,item_name,w_item
49,50,49,Star Wars (1977),2.485691
99,100,99,Fargo (1996),2.428561
255,258,255,Contact (1997),2.363675
124,127,124,"Godfather, The (1972)",2.1516
285,288,285,Scream (1996),2.084864
171,174,171,Raiders of the Lost Ark (1981),2.019069
11,12,11,"Usual Suspects, The (1995)",2.007389
314,318,314,Schindler's List (1993),1.969794
21,22,21,Braveheart (1995),1.934108
178,181,178,Return of the Jedi (1983),1.892119


#### Worst Movies Ever?

In [33]:
item_weights.tail(10)

Unnamed: 0,item_id,item_idx,item_name,w_item
1404,1617,1404,Hugo Pool (1997),-1.776693
1382,1553,1382,"Underneath, The (1995)",-1.77886
652,666,652,Blood For Dracula (Andy Warhol's Dracula) (1974),-1.780362
1398,1605,1398,Love Serenade (1996),-1.797138
1268,1380,1268,Hollow Reed (1996),-1.806405
1313,1442,1313,"Scarlet Letter, The (1995)",-1.806676
1412,1639,1412,Bitter Sugar (Azucar Amargo) (1996),-1.817107
1411,1635,1411,Two Friends (1986),-1.822045
1278,1391,1278,For Ever Mozart (1996),-1.825682
958,996,958,"Big Green, The (1995)",-1.849108


# Start Sandbox Code