### Import Required Packages and Set Options

In [45]:
import os
import sys

import numpy as np
import pandas as pd
import multiprocessing as mp

import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime
from itertools import combinations
from functools import partial
from numpy.linalg import norm

import jax.numpy as jnp
from jax import grad, jit, vmap

In [2]:
REPO_ROOT = "/Users/ericlundquist/Repos/rankfm"
DATA_ROOT = os.path.join(REPO_ROOT, "data/ml-100k")

### Load/Prepare Example Data

#### Prepare Users Data

In [3]:
users_df = pd.read_csv(os.path.join(DATA_ROOT, "users.csv"))
users_df['agegroup'] = pd.cut(users_df['age'], [0, 30, 45, 100], right=False, labels=False)
users_df = users_df.drop(['age', 'zip_code'], axis=1)
users_df = pd.get_dummies(users_df, prefix_sep='__', columns=['agegroup', 'gender', 'occupation'])
users_df.mean()

user_id                      472.000000
agegroup__0                    0.433722
agegroup__1                    0.348887
agegroup__2                    0.217391
gender__F                      0.289502
gender__M                      0.710498
occupation__administrator      0.083775
occupation__artist             0.029692
occupation__doctor             0.007423
occupation__educator           0.100742
occupation__engineer           0.071050
occupation__entertainment      0.019088
occupation__executive          0.033934
occupation__healthcare         0.016967
occupation__homemaker          0.007423
occupation__lawyer             0.012725
occupation__librarian          0.054083
occupation__marketing          0.027572
occupation__none               0.009544
occupation__other              0.111347
occupation__programmer         0.069989
occupation__retired            0.014846
occupation__salesman           0.012725
occupation__scientist          0.032874
occupation__student            0.207847


#### Prepare Items Data

In [4]:
items_df = pd.read_csv(os.path.join(DATA_ROOT, "items.csv"))
item_names = items_df[['item_id', 'item_name']]
item_names.head()

Unnamed: 0,item_id,item_name
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [5]:
items_df = items_df.drop(['item_name', 'release_date'], axis=1)
items_df.columns = ['item_id'] + ["genre__{}".format(col) for col in items_df.columns[1:]]
items_df.mean()

item_id               841.500000
genre__action           0.149227
genre__adventure        0.080262
genre__animation        0.024970
genre__childrens        0.072533
genre__comedy           0.300238
genre__crime            0.064804
genre__documentary      0.029727
genre__drama            0.431034
genre__fantasy          0.013080
genre__film_noir        0.014269
genre__horror           0.054697
genre__musical          0.033294
genre__mystery          0.036266
genre__romance          0.146849
genre__scifi            0.060048
genre__thriller         0.149227
genre__war              0.042212
genre__western          0.016052
dtype: float64

#### Prepare Ratings Data

In [6]:
ratings_explicit = pd.read_csv(os.path.join(DATA_ROOT, "ratings.csv"))
ratings_explicit['timestamp'] = pd.to_datetime(ratings_explicit['unix_timestamp'], origin='unix', unit='s')
ratings_explicit['positive_feedback'] = ratings_explicit.groupby('user_id')['rating'].transform(lambda c: np.where(c > c.mean(), 1, 0))
ratings_explicit = ratings_explicit.drop('unix_timestamp', axis=1)
ratings_explicit.mean()

user_id              462.48475
item_id              425.53013
rating                 3.52986
positive_feedback      0.54194
dtype: float64

In [7]:
ratings_explicit.head()

Unnamed: 0,user_id,item_id,rating,timestamp,positive_feedback
0,196,242,3,1997-12-04 15:55:49,0
1,186,302,3,1998-04-04 19:22:22,0
2,22,377,1,1997-11-07 07:18:36,0
3,244,51,2,1997-11-27 05:02:03,0
4,166,346,1,1998-02-02 05:33:16,0


#### Generate Implicit Feedback Version

In [8]:
ratings_implicit = ratings_explicit[ratings_explicit.positive_feedback == 1][['user_id', 'item_id']].reset_index(drop=True)
ratings_implicit.head()

Unnamed: 0,user_id,item_id
0,253,465
1,286,1014
2,200,222
3,224,29
4,122,387


#### Print Final Matrix Shapes
* NOTE: consider some way to include these movies that literally nobody rated above average in the sampling procedure

In [11]:
print(ratings_explicit.user_id.nunique(), ratings_explicit.item_id.nunique())
print(ratings_implicit.user_id.nunique(), ratings_implicit.item_id.nunique())

print(ratings_explicit.shape)
print(ratings_implicit.shape)

943 1682
943 1483
(100000, 5)
(54194, 2)


### Define Helper/Utility Functions

In [13]:
def sample_tuple(u, i, items, user_items):
    """sample a single (u, i, j) tuple"""
    
    return u, i, np.random.choice(list(items - user_items[u]))    


def sample_tuples(interactions, items, user_items, n_jobs=mp.cpu_count()):
    """sample a (u, i, j) tuple for every observed interaction"""

    pool = mp.Pool(processes=n_jobs)
    mp_sample_tuple = partial(sample_tuple, items=items, user_items=user_items)
    samples = pool.starmap(mp_sample_tuple, zip(interactions.iloc[:, 0], interactions.iloc[:, 1]))
    pool.close()
    pool.join()
    return samples

### Define Main Model Class

In [87]:
class RankFM():
    """Factorization Machines for Ranking Problems with Implicit Feedback Data"""
    
    def __init__(self, factors=10, learning_rate=0.01, regularization=0.1, sigma=0.1):
        """store hyperparameters and initialize internal data"""
        
        # hyperparameters
        self.factors = factors
        self.learning_rate = learning_rate
        self.regularization = regularization
        self.sigma = sigma
        
        # user/item identifier <--> index position mappings
        self.index_to_user = None
        self.index_to_item = None
        self.user_to_index = None
        self.item_to_index = None
        
        # store the unique sets of users, items, and user_items
        self.interactions = None
        self.user_idx = None
        self.item_idx = None
        self.user_items = None
        
        # number of unique users/items
        self.n_users = None
        self.n_items = None

        # model weights
        self.w_item = None
        self.v_user = None
        self.v_item = None
        

    def initialize(self, interactions):
        """create raw user/item identifier to zero-based index position mappings"""

        # create mappings of index positions to raw user/item identifiers
        self.index_to_user = pd.Series(np.sort(np.unique(interactions.iloc[:, 0]))).to_dict()
        self.index_to_item = pd.Series(np.sort(np.unique(interactions.iloc[:, 1]))).to_dict()
        
        # create mappings of raw user/item identifiers to index positions
        self.user_to_index = {val:key for key, val in self.index_to_user.items()}
        self.item_to_index = {val:key for key, val in self.index_to_item.items()}
        
        # store a copy of the interaction data with identifiers mapped to index positions
        self.interactions = interactions.copy()
        self.interactions.iloc[:, 0] = self.interactions.iloc[:, 0].map(self.user_to_index)
        self.interactions.iloc[:, 1] = self.interactions.iloc[:, 1].map(self.item_to_index)
        
        # store unique user_idx, item_idx, and user_items
        self.user_idx = set(self.interactions.iloc[:, 0])
        self.item_idx = set(self.interactions.iloc[:, 1])
        self.user_items = self.interactions.groupby(self.interactions.columns[0])[self.interactions.columns[1]].apply(set).to_dict()
        
        # store the number of unique users/items
        self.n_users = len(self.user_idx)
        self.n_items = len(self.item_idx)
        
        # initialize model weights
        self.w_item = np.zeros(self.n_items)
        self.v_user = np.random.normal(loc=0, scale=self.sigma, size=(self.n_users, self.factors))
        self.v_item = np.random.normal(loc=0, scale=self.sigma, size=(self.n_items, self.factors))


    def pointwise_utility(self, user, item):
        """calculate the predicted utility for a given (user, item) pair"""

        utility = self.w_item[item] + np.dot(self.v_user[user], self.v_item[item])
        return utility


    def pairwise_utility(self, sample):
        """calculate pairwise utility of a sample given current weights"""

        u, i, j = sample
        utility = self.w_item[i] - self.w_item[j] + np.dot(self.v_user[u], self.v_item[i] - self.v_item[j])
        return utility


    def log_likelihood(self, sample):
        """calculate the regularized log-likelihood of a sample given current weights"""

        utility = self.pairwise_utility(sample)
        likelihood = np.log(1 / (1 + np.exp(-utility)))
        penalty = sum([np.sum(self.regularization * np.square(w)) for w in [self.w_item, self.v_user, self.v_item]])
        return likelihood - penalty


    def update_weights(self, sample):
        """update model weights based on gradient of log-likelihood function"""

        u, i, j = sample
        utility = self.pairwise_utility(sample)

        d_con = (1 / (1 + np.exp(utility)))
        d_pen = 2 * self.regularization

        d_w_i = 1
        d_w_j = -1
        d_v_u = self.v_item[i] - self.v_item[j]
        d_v_i = self.v_user[u]
        d_v_j = -self.v_user[u]

        self.w_item[i] += self.learning_rate * ((d_con * d_w_i) - (d_pen * self.w_item[i]))
        self.w_item[j] += self.learning_rate * ((d_con * d_w_j) - (d_pen * self.w_item[j]))
        self.v_user[u] += self.learning_rate * ((d_con * d_v_u) - (d_pen * self.v_user[u]))
        self.v_item[i] += self.learning_rate * ((d_con * d_v_i) - (d_pen * self.v_item[i]))
        self.v_item[j] += self.learning_rate * ((d_con * d_v_j) - (d_pen * self.v_item[j]))
    
    
    def fit(self, interactions, n_epochs=1):
        """train model weights using the interaction data"""
        
        # initialize all internal data based on provided interaction data
        self.initialize(interactions)
        
        for epoch in range(n_epochs):
            
            # shuffle the interaction data and create (u, i, j) samples for training
            print("beginning training epoch: {}".format(epoch))
            shuffle_index = np.arange(self.interactions.shape[0])
            np.random.shuffle(shuffle_index)
            shuffle_samples = sample_tuples(self.interactions.iloc[shuffle_index], self.item_idx, self.user_items)

            # loop over the (u, i, j) samples to update model weights via SGD
            for sample in shuffle_samples:
                self.update_weights(sample)
                
            
    def recommend_for_users(self, users, n_items=10):
        """calculate the topN items for each user"""

        user_idx = pd.Series(users).map(self.user_to_index)
        top_n = [pd.Series(self.w_item + np.dot(self.v_item, self.v_user[user])).sort_values(ascending=False)[:n_items].index.values for user in user_idx]
        top_n = pd.DataFrame(top_n, index=users).apply(lambda c: c.map(self.index_to_item))
        return top_n

    

### Test Out the Model on the Example Data

#### Create Interaction Data

In [88]:
interactions = ratings_implicit[['user_id', 'item_id']]
print(interactions.shape)
interactions.head()

(54194, 2)


Unnamed: 0,user_id,item_id
0,253,465
1,286,1014
2,200,222
3,224,29
4,122,387


#### Initialize and Fit the Model

In [99]:
model = RankFM(factors=10, learning_rate=0.01, regularization=0.1)
model.fit(interactions, n_epochs=3)

beginning training epoch: 0
beginning training epoch: 1
beginning training epoch: 2


#### Generate Recommendations

In [100]:
users = interactions.user_id.unique()
len(users)

943

In [101]:
recommendations = model.recommend_for_users(users, n_items=10)
recommendations.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
253,50,100,172,1,127,98,174,258,181,313
286,50,100,172,1,98,127,174,258,181,313
200,50,100,1,172,98,127,174,258,181,56
224,50,100,172,1,98,127,174,258,181,56
122,50,100,172,1,127,98,174,258,181,313


#### Spot-Check Some Top Recommendations

In [102]:
item_names[item_names.item_id.isin([50, 100, 181, 174, 258, 127, 98, 1, 172, 56])]

Unnamed: 0,item_id,item_name
0,1,Toy Story (1995)
49,50,Star Wars (1977)
55,56,Pulp Fiction (1994)
97,98,"Silence of the Lambs, The (1991)"
99,100,Fargo (1996)
126,127,"Godfather, The (1972)"
171,172,"Empire Strikes Back, The (1980)"
173,174,Raiders of the Lost Ark (1981)
180,181,Return of the Jedi (1983)
257,258,Contact (1997)


#### Look at the Highest/Lowest Item Weights

In [103]:
item_weights = pd.DataFrame({'item_idx': np.arange(len(model.w_item)), 'w_item': model.w_item})
item_weights['item_id'] = item_weights['item_idx'].map(model.index_to_item)
item_weights = pd.merge(item_weights, item_names, on='item_id').sort_values('w_item', ascending=False)

#### Best Movies Ever?

In [104]:
item_weights.head()

Unnamed: 0,item_idx,w_item,item_id,item_name
49,49,1.065487,50,Star Wars (1977)
99,99,1.058731,100,Fargo (1996)
169,169,1.048541,172,"Empire Strikes Back, The (1980)"
0,0,1.04386,1,Toy Story (1995)
97,97,1.032312,98,"Silence of the Lambs, The (1991)"


#### Worst Movies Ever?

In [105]:
item_weights.tail()

Unnamed: 0,item_idx,w_item,item_id,item_name
1340,1340,-0.432035,1441,Moonlight and Valentino (1995)
1276,1276,-0.435619,1347,"Ballad of Narayama, The (Narayama Bushiko) (1958)"
820,820,-0.443175,839,Loch Ness (1995)
1273,1273,-0.44344,1342,"Convent, The (Convento, O) (1995)"
1372,1372,-0.456451,1481,S.F.W. (1994)


### Evaluate Hold-Out Metrics Relative to Random and Pure-Popularity Baselines

# Start Sandbox Code