### Import Required Packages and Set Options

In [1]:
import os
import sys

import numpy as np
import pandas as pd
import multiprocessing as mp

import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime
from itertools import combinations
from functools import partial
from numpy.linalg import norm

import jax.numpy as jnp
from jax import grad, jit, vmap

In [2]:
# Location of the repository checkout. Set the RANKFM_REPO_ROOT environment variable
# to point at your own clone; otherwise the original author's local path is used.
REPO_ROOT = os.environ.get("RANKFM_REPO_ROOT", "/Users/ericlundquist/Repos/rankfm")
# Directory the users.csv / items.csv / ratings.csv files are read from below.
DATA_ROOT = os.path.join(REPO_ROOT, "data/ml-100k")

### Load/Prepare Example Data

#### Prepare Users Data

In [3]:
# Build the one-hot encoded user feature table in a single chain:
# bucket age into three groups, drop the raw columns, then dummy-encode the categoricals.
users_df = (
    pd.read_csv(os.path.join(DATA_ROOT, "users.csv"))
    .assign(agegroup=lambda frame: pd.cut(frame['age'], [0, 30, 45, 100], right=False, labels=False))
    .drop(['age', 'zip_code'], axis=1)
    .pipe(pd.get_dummies, prefix_sep='__', columns=['agegroup', 'gender', 'occupation'])
)
users_df.mean()

user_id                      472.000000
agegroup__0                    0.433722
agegroup__1                    0.348887
agegroup__2                    0.217391
gender__F                      0.289502
gender__M                      0.710498
occupation__administrator      0.083775
occupation__artist             0.029692
occupation__doctor             0.007423
occupation__educator           0.100742
occupation__engineer           0.071050
occupation__entertainment      0.019088
occupation__executive          0.033934
occupation__healthcare         0.016967
occupation__homemaker          0.007423
occupation__lawyer             0.012725
occupation__librarian          0.054083
occupation__marketing          0.027572
occupation__none               0.009544
occupation__other              0.111347
occupation__programmer         0.069989
occupation__retired            0.014846
occupation__salesman           0.012725
occupation__scientist          0.032874
occupation__student            0.207847


#### Prepare Items Data

In [4]:
# load the raw item table and keep an (item_id, item_name) lookup used later to label results
items_df = pd.read_csv(os.path.join(DATA_ROOT, "items.csv"))
item_names = items_df[['item_id', 'item_name']]
item_names.head()

Unnamed: 0,item_id,item_name
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [5]:
# drop the descriptive columns, then prefix every column after item_id with "genre__"
# NOTE(review): items_df is reassigned here, so re-running this cell without re-running the
# load cell first looks like it would fail on the drop (columns already removed) -- confirm
items_df = items_df.drop(['item_name', 'release_date'], axis=1)
items_df.columns = ['item_id'] + ["genre__{}".format(col) for col in items_df.columns[1:]]
items_df.mean()

item_id               841.500000
genre__action           0.149227
genre__adventure        0.080262
genre__animation        0.024970
genre__childrens        0.072533
genre__comedy           0.300238
genre__crime            0.064804
genre__documentary      0.029727
genre__drama            0.431034
genre__fantasy          0.013080
genre__film_noir        0.014269
genre__horror           0.054697
genre__musical          0.033294
genre__mystery          0.036266
genre__romance          0.146849
genre__scifi            0.060048
genre__thriller         0.149227
genre__war              0.042212
genre__western          0.016052
dtype: float64

#### Prepare Ratings Data

In [6]:
ratings_explicit = pd.read_csv(os.path.join(DATA_ROOT, "ratings.csv"))
# convert epoch seconds into proper timestamps
ratings_explicit['timestamp'] = pd.to_datetime(ratings_explicit['unix_timestamp'], origin='unix', unit='s')
# positive feedback = a rating strictly above that user's own mean rating
ratings_explicit['positive_feedback'] = ratings_explicit.groupby('user_id')['rating'].transform(lambda c: np.where(c > c.mean(), 1, 0))
ratings_explicit = ratings_explicit.drop('unix_timestamp', axis=1)
# restrict the summary to numeric columns: how DataFrame.mean() treats the datetime
# 'timestamp' column depends on the pandas version, and the intent here is a numeric summary
ratings_explicit.mean(numeric_only=True)

user_id              462.48475
item_id              425.53013
rating                 3.52986
positive_feedback      0.54194
dtype: float64

In [7]:
# preview the explicit ratings with the derived timestamp and positive_feedback columns
ratings_explicit.head()

Unnamed: 0,user_id,item_id,rating,timestamp,positive_feedback
0,196,242,3,1997-12-04 15:55:49,0
1,186,302,3,1998-04-04 19:22:22,0
2,22,377,1,1997-11-07 07:18:36,0
3,244,51,2,1997-11-27 05:02:03,0
4,166,346,1,1998-02-02 05:33:16,0


#### Generate Implicit Feedback Version

In [8]:
# implicit-feedback view: keep only rows flagged as positive feedback and renumber them from zero
ratings_implicit = ratings_explicit.loc[ratings_explicit['positive_feedback'].eq(1)].reset_index(drop=True)
ratings_implicit.head()

Unnamed: 0,user_id,item_id,rating,timestamp,positive_feedback
0,253,465,5,1998-04-03 18:34:27,1
1,286,1014,5,1997-11-17 15:38:45,1
2,200,222,5,1997-10-05 09:05:40,1
3,224,29,3,1998-02-21 23:40:57,1
4,122,387,5,1997-11-11 17:47:39,1


#### Print Final Matrix Shapes
* NOTE: 199 of the 1,682 movies were never rated above a user's average, so they do not appear in the implicit-feedback data at all; consider a way to include them in the sampling procedure.

In [9]:
# distinct (users, items) in the explicit data, then in the implicit subset
print(ratings_explicit['user_id'].nunique(), ratings_explicit['item_id'].nunique())
print(ratings_implicit['user_id'].nunique(), ratings_implicit['item_id'].nunique())

# (rows, columns) of each frame, one per line
print(ratings_explicit.shape, ratings_implicit.shape, sep="\n")

943 1682
943 1483
(100000, 5)
(54194, 5)


### Define Helper/Utility Functions

In [41]:
def sample_tuple(u, i, items, user_items):
    """draw one (u, i, j) triple where j is an item not in user_items[u]

    u and i are returned unchanged; j is picked uniformly at random from
    the set difference between all items and the items recorded for user u.
    """

    unobserved = items - user_items[u]
    j = np.random.choice(list(unobserved))
    return u, i, j


def sample_tuples(interactions, items, user_items, processes=mp.cpu_count()):
    """sample a (u, i, j) tuple for every observed interaction

    interactions: DataFrame whose first two columns are the (user, item) pairs
    items:        set of all item identifiers to draw negatives from
    user_items:   dict mapping each user to the set of items observed for that user
    processes:    number of worker processes in the pool

    returns a list of (u, i, j) tuples in the same order as the interaction rows
    """

    mp_sample_tuple = partial(sample_tuple, items=items, user_items=user_items)
    pairs = zip(interactions.iloc[:, 0], interactions.iloc[:, 1])

    # forked workers inherit an identical copy of numpy's global RNG state, which makes
    # their draws follow the same random sequence; calling np.random.seed() with no
    # arguments in each worker reseeds it from fresh OS entropy
    pool = mp.Pool(processes=processes, initializer=np.random.seed)
    try:
        samples = pool.starmap(mp_sample_tuple, pairs)
    finally:
        # always release the worker processes, even if sampling raised
        pool.close()
        pool.join()
    return samples

### Define Evaluation Metric Functions

In [42]:
def precision_at_k(model, test_interactions, k=10):
    """evaluate precision wrt test-set observed interactions

    model:             object exposing recommend_for_users(users, n_items) that returns a
                       DataFrame indexed by user with one column per recommended item
    test_interactions: DataFrame whose first two columns are (user, item) identifiers
    k:                 number of recommendations requested for each user

    returns the mean over scorable test users of |recommended & observed| / |recommended|
    """

    user_col, item_col = test_interactions.columns[:2]
    test_user_items = test_interactions.groupby(user_col)[item_col].apply(list).to_dict()

    # pass k through explicitly: previously the model's default list length was always used
    test_recommends = model.recommend_for_users(list(test_user_items.keys()), n_items=k)

    # users missing from the recommendation index cannot be scored, so they are skipped
    scores = []
    for user, observed in test_user_items.items():
        if user not in test_recommends.index:
            continue
        recommended = set(test_recommends.loc[user])
        scores.append(len(recommended & set(observed)) / len(recommended))

    precision = np.mean(scores)
    return precision


def recall_at_k(model, test_interactions, k=10):
    """evaluate recall wrt test-set observed interactions

    model:             object exposing recommend_for_users(users, n_items) that returns a
                       DataFrame indexed by user with one column per recommended item
    test_interactions: DataFrame whose first two columns are (user, item) identifiers
    k:                 number of recommendations requested for each user

    returns the mean over scorable test users of |recommended & observed| / |observed|
    """

    user_col, item_col = test_interactions.columns[:2]
    test_user_items = test_interactions.groupby(user_col)[item_col].apply(list).to_dict()

    # pass k through explicitly: previously the model's default list length was always used
    test_recommends = model.recommend_for_users(list(test_user_items.keys()), n_items=k)

    # users missing from the recommendation index cannot be scored, so they are skipped
    scores = []
    for user, observed in test_user_items.items():
        if user not in test_recommends.index:
            continue
        relevant = set(observed)
        scores.append(len(set(test_recommends.loc[user]) & relevant) / len(relevant))

    recall = np.mean(scores)
    return recall

### Define Main Model Class

In [43]:
class RankFM():
    """Factorization Machines for Ranking Problems with Implicit Feedback Data

    Each item has a scalar weight and a latent factor vector; each user has a latent
    factor vector. The score of a (user, item) pair is w_item[item] + v_user[user] . v_item[item].
    Training takes one gradient step per sampled (u, i, j) triple on the log-sigmoid of the
    pairwise utility score(u, i) - score(u, j), with an L2 penalty on all weights.
    """

    def __init__(self, factors=10, learning_rate=0.1, regularization=0.01, sigma=0.1):
        """store hyperparameters and initialize internal data

        factors:        length of each user/item latent factor vector
        learning_rate:  step size applied to every gradient update
        regularization: L2 penalty strength on all weights
        sigma:          standard deviation of the normal draw used to initialize the factors
        """

        # model hyperparameters
        self.factors = factors
        self.learning_rate = learning_rate
        self.regularization = regularization
        self.sigma = sigma

        # user/item identifiers <--> index positions mappings
        self.index_to_user = None
        self.index_to_item = None
        self.user_to_index = None
        self.item_to_index = None

        # index-mapped interactions and unique sets of [users, items, user_items]
        self.interactions = None
        self.user_idx = None
        self.item_idx = None
        self.user_items = None

        # number of unique users/items
        self.n_users = None
        self.n_items = None

        # model weights
        self.w_item = None
        self.v_user = None
        self.v_item = None


    def initialize(self, interactions):
        """create raw user/item identifier to zero-based index position mappings

        interactions: DataFrame whose first two columns are (user, item) identifiers

        Also stores an index-mapped copy of the interactions, the per-user item sets,
        and freshly initialized weights (zeros for w_item, N(0, sigma) for the factors).
        """

        # position in the sorted unique identifiers becomes the zero-based index
        self.index_to_user = pd.Series(np.sort(np.unique(interactions.iloc[:, 0]))).to_dict()
        self.index_to_item = pd.Series(np.sort(np.unique(interactions.iloc[:, 1]))).to_dict()

        self.user_to_index = {val:key for key, val in self.index_to_user.items()}
        self.item_to_index = {val:key for key, val in self.index_to_item.items()}

        # work on a copy so the caller's frame keeps its raw identifiers
        self.interactions = interactions.copy()
        self.interactions.iloc[:, 0] = self.interactions.iloc[:, 0].map(self.user_to_index)
        self.interactions.iloc[:, 1] = self.interactions.iloc[:, 1].map(self.item_to_index)

        self.user_idx = set(self.interactions.iloc[:, 0])
        self.item_idx = set(self.interactions.iloc[:, 1])
        self.user_items = self.interactions.groupby(self.interactions.columns[0])[self.interactions.columns[1]].apply(set).to_dict()

        self.n_users = len(self.user_idx)
        self.n_items = len(self.item_idx)

        self.w_item = np.zeros(self.n_items)
        self.v_user = np.random.normal(loc=0, scale=self.sigma, size=(self.n_users, self.factors))
        self.v_item = np.random.normal(loc=0, scale=self.sigma, size=(self.n_items, self.factors))


    def pairwise_utility(self, sample):
        """calculate pairwise utility of a sample given current weights

        sample: (u, i, j) index triple; the result is score(u, i) - score(u, j)
        """

        u, i, j = sample
        utility = self.w_item[i] - self.w_item[j] + np.dot(self.v_user[u], self.v_item[i] - self.v_item[j])
        return utility


    def update_weights(self, sample):
        """update current weights based on gradient of log-likelihood function

        sample: (u, i, j) index triple; the weights of u, i and j are modified in place
        """

        u, i, j = sample
        utility = self.pairwise_utility(sample)

        # 1 / (1 + exp(utility)) written via logaddexp so large utilities cannot overflow
        d_con = np.exp(-np.logaddexp(0, utility))
        d_pen = 2 * self.regularization

        d_w_i = 1
        d_w_j = -1
        d_v_u = self.v_item[i] - self.v_item[j]
        # copy: self.v_user[u] is a view that the in-place user update below mutates, and
        # both item gradients must use the user factors from before this step
        d_v_i = self.v_user[u].copy()
        d_v_j = -d_v_i

        self.w_item[i] += self.learning_rate * ((d_con * d_w_i) - (d_pen * self.w_item[i]))
        self.w_item[j] += self.learning_rate * ((d_con * d_w_j) - (d_pen * self.w_item[j]))
        self.v_user[u] += self.learning_rate * ((d_con * d_v_u) - (d_pen * self.v_user[u]))
        self.v_item[i] += self.learning_rate * ((d_con * d_v_i) - (d_pen * self.v_item[i]))
        self.v_item[j] += self.learning_rate * ((d_con * d_v_j) - (d_pen * self.v_item[j]))


    def log_likelihood(self, samples):
        """calculate the regularized log-likelihood of all sample pairs given current weights

        samples: iterable of (u, i, j) index triples

        returns sum(log(sigmoid(utility))) minus the L2 penalty over all weights
        """

        # log(sigmoid(x)) == -logaddexp(0, -x); this form stays finite for very negative x
        likelihood = sum(-np.logaddexp(0, -self.pairwise_utility(sample)) for sample in samples)
        penalty = sum([np.sum(self.regularization * np.square(w)) for w in [self.w_item, self.v_user, self.v_item]])
        return likelihood - penalty


    def fit(self, interactions, epochs=1):
        """train model weights using the interaction data

        interactions: DataFrame whose first two columns are (user, item) identifiers
        epochs:       number of full passes over the interactions

        Weights are re-initialized on every call. Each epoch shuffles the interactions,
        samples one (u, i, j) triple per row, applies one update per triple, and prints
        the penalized log-likelihood of that epoch's samples.
        """

        self.initialize(interactions)

        for epoch in range(epochs):

            print("\ntraining epoch: {}".format(epoch))
            shuffle_index = np.arange(self.interactions.shape[0])
            np.random.shuffle(shuffle_index)

            shuffle_samples = sample_tuples(self.interactions.iloc[shuffle_index], self.item_idx, self.user_items)
            for sample in shuffle_samples:
                self.update_weights(sample)

            ll = self.log_likelihood(shuffle_samples)
            print("penalized log-likelihood: {}".format(round(ll, 2)))


    def predict(self, interactions):
        """calculate the predicted utility for each given (user, item) pair

        interactions: DataFrame whose first two columns are raw (user, item) identifiers

        returns a float array with one score per row; rows whose user or item was not
        seen during training get NaN instead of aborting the whole batch
        """

        user_pos = interactions.iloc[:, 0].map(self.user_to_index)
        item_pos = interactions.iloc[:, 1].map(self.item_to_index)

        # identifiers missing from the mappings come back as NaN and cannot index the weights
        known = (user_pos.notna() & item_pos.notna()).values

        scores = np.full(len(interactions), np.nan)
        pairs = zip(user_pos[known].astype(int), item_pos[known].astype(int))
        scores[known] = [self.w_item[item] + np.dot(self.v_user[user], self.v_item[item]) for user, item in pairs]
        return scores


    def recommend_for_users(self, users, n_items=10):
        """calculate the topN items for each user

        users:   iterable of raw user identifiers
        n_items: number of items to return per user

        returns a DataFrame indexed by raw user identifier with one column per rank
        holding raw item identifiers, best first. Users not seen in training are dropped.
        Items the user already interacted with are not filtered out.
        """

        user_idx = pd.Series(users).map(self.user_to_index).dropna().astype('int')
        user_ids = user_idx.map(self.index_to_user)

        # score every item for the user, sort descending, keep the positions of the best n_items
        top_n = [pd.Series(self.w_item + np.dot(self.v_item, self.v_user[user])).sort_values(ascending=False).iloc[:n_items].index.values for user in user_idx]
        top_n = pd.DataFrame(top_n, index=user_ids).apply(lambda c: c.map(self.index_to_item))
        return top_n

    

### Test Out the Model on the Example Data

#### Create (Training, Validation) Interactions for Evaluation

In [44]:
# confirm the size of the implicit-feedback data before splitting it
print(ratings_implicit.shape)
ratings_implicit.head()

(54194, 5)


Unnamed: 0,user_id,item_id,rating,timestamp,positive_feedback
0,253,465,5,1998-04-03 18:34:27,1
1,286,1014,5,1997-11-17 15:38:45,1
2,200,222,5,1997-10-05 09:05:40,1
3,224,29,3,1998-02-21 23:40:57,1
4,122,387,5,1997-11-11 17:47:39,1


In [47]:
test_pct = 0.25

# positional holdout: the leading (1 - test_pct) share of rows is used for training
# and the trailing test_pct share for validation
train_mask = ratings_implicit.index / len(ratings_implicit) < 1 - test_pct
valid_mask = ratings_implicit.index / len(ratings_implicit) >= 1 - test_pct

interactions_train = ratings_implicit.loc[train_mask, ['user_id', 'item_id']]
interactions_valid = ratings_implicit.loc[valid_mask, ['user_id', 'item_id']]

interactions_train.shape, interactions_valid.shape

((40646, 2), (13548, 2))

In [48]:
# check the identifier column dtypes of the validation interactions
interactions_valid.dtypes

user_id    int64
item_id    int64
dtype: object

#### Initialize and Fit the Model

In [49]:
%%prun -l 10 

# profile one full training run (the cell magic above must stay on the first line);
# the profiler report is limited to its top 10 entries
model = RankFM(factors=10, learning_rate=0.1, regularization=0.01)
model.fit(interactions_train, epochs=10)


training epoch: 0
penalized log-likelihood: -15669.9

training epoch: 1
penalized log-likelihood: -14455.72

training epoch: 2
penalized log-likelihood: -14015.59

training epoch: 3
penalized log-likelihood: -13115.29

training epoch: 4
penalized log-likelihood: -11771.33

training epoch: 5
penalized log-likelihood: -10580.99

training epoch: 6
penalized log-likelihood: -9728.26

training epoch: 7
penalized log-likelihood: -8807.93

training epoch: 8
penalized log-likelihood: -8386.25

training epoch: 9
penalized log-likelihood: -7793.47
 

         2784642 function calls (2783410 primitive calls) in 26.917 seconds

   Ordered by: internal time
   List reduced from 616 to 10 due to restriction <10>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
     1349   12.559    0.009   12.559    0.009 {method 'acquire' of '_thread.lock' objects}
   406460    9.119    0.000   10.440    0.000 <ipython-input-43-7da0fb5df88d>:68(update_weights)
   812920    1.747    0.000    2.480    0.000 <ipython-input-43-7da0fb5df88d>:60(pairwise_utility)
   406470    1.446    0.000    2.605    0.000 <ipython-input-43-7da0fb5df88d>:93(<genexpr>)
   812920    0.733    0.000    0.733    0.000 {built-in method numpy.dot}
        1    0.320    0.320   26.911   26.911 <ipython-input-43-7da0fb5df88d>:98(fit)
       10    0.153    0.015    0.156    0.016 pool.py:397(_map_async)
      360    0.146    0.000    0.146    0.000 {built-in method posix.waitpid}
       80    0.116    0.001    0.149    0.002 {built-in method posix.fork}
    

#### Evaluate Model Performance

In [50]:
# cutoff shared by the model metrics and the popularity baseline below
k = 10

In [51]:
# precision/recall of the fitted model on the held-out interactions, rounded to 3 decimals
model_pre = round(precision_at_k(model, interactions_valid, k=k), 3)
model_rec = round(recall_at_k(model, interactions_valid, k=k), 3)
print("model precision: {} model recall: {}".format(model_pre, model_rec))

model precision: 0.122 model recall: 0.13


#### Evaluate Pure Popularity Baseline

In [52]:
# the k items with the most training interactions (counts as values, item_id as index)
most_popular = (
    interactions_train
    .groupby('item_id')
    .size()
    .sort_values(ascending=False)
    .head(k)
)
most_popular

item_id
50     359
100    292
181    277
258    261
98     249
174    246
127    246
1      226
56     223
313    222
dtype: int64

In [53]:
# observed validation items per user, restricted to users the model returns recommendations for
# so the baseline is scored on the same users as the model
test_user_items = interactions_valid.groupby('user_id')['item_id'].apply(list).to_dict()
test_recommends = model.recommend_for_users(list(test_user_items))
test_user_items = {user: seen for user, seen in test_user_items.items() if user in test_recommends.index}

# every user gets the same recommendation list: the globally most popular items
base_pre = round(np.mean([len(set(most_popular.index) & set(seen)) / len(set(most_popular.index)) for seen in test_user_items.values()]), 3)
base_rec = round(np.mean([len(set(most_popular.index) & set(seen)) / len(set(seen)) for seen in test_user_items.values()]), 3)

print("number of test users: {}".format(len(test_user_items)))
print("baseline precision: {} baseline recall: {}".format(base_pre, base_rec))

number of test users: 905
baseline precision: 0.098 baseline recall: 0.087


#### Spot-Check Some Top Recommendations

In [54]:
# top-10 recommendations for every scorable test user (rows = users, columns = rank)
test_users = list(test_user_items.keys())
recommendations = model.recommend_for_users(test_users, n_items=10)
recommendations.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
1,89,100,127,12,50,168,154,209,174,98
2,302,258,313,307,328,272,286,300,269,288
3,690,326,328,292,310,313,245,307,333,347
4,288,258,313,326,690,328,302,333,310,300
5,168,195,96,69,230,588,210,121,568,22


In [55]:
# look up the titles recommended to one user; isin() filters item_names, so the rows
# appear in item_names order rather than in recommendation rank order
user_id = 6
item_names[item_names.item_id.isin(recommendations.loc[user_id])]

Unnamed: 0,item_id,item_name
126,127,"Godfather, The (1972)"
190,191,Amadeus (1984)
196,197,"Graduate, The (1967)"
210,211,M*A*S*H (1970)
426,427,To Kill a Mockingbird (1962)
478,479,Vertigo (1958)
479,480,North by Northwest (1959)
482,483,Casablanca (1942)
510,511,Lawrence of Arabia (1962)
602,603,Rear Window (1954)


### Look at the Highest/Lowest Model Item Weights

In [56]:
# pair each learned item weight with its raw item_id and title, highest weight first
item_weights = (
    pd.DataFrame({'item_idx': np.arange(len(model.w_item)), 'w_item': model.w_item})
    .assign(item_id=lambda frame: frame['item_idx'].map(model.index_to_item))
    .merge(item_names, on='item_id')
    .sort_values('w_item', ascending=False)
)

#### Best Movies Ever?

In [57]:
# ten items with the largest learned item weight
item_weights.head(10)

Unnamed: 0,item_idx,w_item,item_id,item_name
49,49,2.595656,50,Star Wars (1977)
99,99,2.44535,100,Fargo (1996)
178,178,2.274072,181,Return of the Jedi (1983)
124,124,2.256486,127,"Godfather, The (1972)"
97,97,2.249431,98,"Silence of the Lambs, The (1991)"
255,255,2.225727,258,Contact (1997)
0,0,2.136893,1,Toy Story (1995)
11,11,2.045175,12,"Usual Suspects, The (1995)"
285,285,2.004289,288,Scream (1996)
169,169,2.000158,172,"Empire Strikes Back, The (1980)"


#### Worst Movies Ever?

In [58]:
# ten items with the smallest learned item weight
item_weights.tail(10)

Unnamed: 0,item_idx,w_item,item_id,item_name
654,654,-1.756699,668,Blood Beach (1981)
1084,1084,-1.7576,1130,Jupiter's Wife (1994)
34,34,-1.7583,35,Free Willy 2: The Adventure Home (1995)
1410,1410,-1.762835,1633,¡ kˆldum klaka (Cold Fever) (1994)
1323,1323,-1.765077,1458,"Damsel in Distress, A (1937)"
1326,1326,-1.767462,1463,"Boys, Les (1997)"
1349,1349,-1.768889,1498,Farmer & Chase (1995)
1321,1321,-1.775678,1452,Lady of Burlesque (1943)
586,586,-1.777043,600,Daniel Defoe's Robinson Crusoe (1996)
1381,1381,-1.866271,1550,Destiny Turns on the Radio (1995)


# Start Sandbox Code