### Import Required Packages and Set Options

In [1]:
import os
import sys

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime
from itertools import combinations
from functools import partial

import jax.numpy as jnp
from jax import grad, jit, vmap

from pandas.api.types import CategoricalDtype
from scipy.sparse import csr_matrix

from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
REPO_ROOT = "/Users/ericlundquist/Repos/rankfm"
DATA_ROOT = os.path.join(REPO_ROOT, "data/ml-100k")

### Load Example Data

In [3]:
users_df = pd.read_csv(os.path.join(DATA_ROOT, "users.csv"))
items_df = pd.read_csv(os.path.join(DATA_ROOT, "items.csv"))
ratings_df = pd.read_csv(os.path.join(DATA_ROOT, "ratings.csv"))

print("users: {} items: {} ratings: {}".format(users_df.shape, items_df.shape, ratings_df.shape))

users: (943, 5) items: (1682, 21) ratings: (100000, 4)


#### Prepare Users Data

In [4]:
users_df['agegroup'] = pd.cut(users_df['age'], [0, 30, 45, 100], right=False, labels=False)
users_df = users_df.drop(['age', 'zip_code'], axis=1)
users_df = pd.get_dummies(users_df, prefix_sep='__', columns=['agegroup', 'gender', 'occupation'])
users_df.mean()

user_id                      472.000000
agegroup__0                    0.433722
agegroup__1                    0.348887
agegroup__2                    0.217391
gender__F                      0.289502
gender__M                      0.710498
occupation__administrator      0.083775
occupation__artist             0.029692
occupation__doctor             0.007423
occupation__educator           0.100742
occupation__engineer           0.071050
occupation__entertainment      0.019088
occupation__executive          0.033934
occupation__healthcare         0.016967
occupation__homemaker          0.007423
occupation__lawyer             0.012725
occupation__librarian          0.054083
occupation__marketing          0.027572
occupation__none               0.009544
occupation__other              0.111347
occupation__programmer         0.069989
occupation__retired            0.014846
occupation__salesman           0.012725
occupation__scientist          0.032874
occupation__student            0.207847


#### Prepare Items Data

In [5]:
item_names = items_df[['item_id', 'item_name']]
item_names.head()

Unnamed: 0,item_id,item_name
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [6]:
items_df = items_df.drop(['item_name', 'release_date'], axis=1)
items_df.columns = ['item_id'] + ["genre__{}".format(col) for col in items_df.columns[1:]]
items_df.mean()

item_id               841.500000
genre__action           0.149227
genre__adventure        0.080262
genre__animation        0.024970
genre__childrens        0.072533
genre__comedy           0.300238
genre__crime            0.064804
genre__documentary      0.029727
genre__drama            0.431034
genre__fantasy          0.013080
genre__film_noir        0.014269
genre__horror           0.054697
genre__musical          0.033294
genre__mystery          0.036266
genre__romance          0.146849
genre__scifi            0.060048
genre__thriller         0.149227
genre__war              0.042212
genre__western          0.016052
dtype: float64

#### Prepare Ratings Data

In [7]:
ratings_df['timestamp'] = pd.to_datetime(ratings_df['unix_timestamp'], origin='unix', unit='s')
ratings_df = ratings_df.drop('unix_timestamp', axis=1)
ratings_df.mean()

user_id    462.48475
item_id    425.53013
rating       3.52986
dtype: float64

### Build the Sparse Model Matrix

In [640]:
class RankFM():
    """Factorization Machines for Ranking Problems with Implicit Feedback Data"""
    
    def __init__(self, factors=10, regularization=0.1, learning_rate=0.1, sigma=0.1):
        """store input hyperparameters and initialize internal data elements"""
        
        # hyperparameters
        self.factors = factors
        self.regularization = regularization
        self.learning_rate = learning_rate
        self.sigma = sigma
        
        # key column names
        self.user_id_ = None
        self.item_id_ = None
        self.rating_ = None
        
        # unique feature names
        self.users_ = None
        self.items_ = None
        self.user_features_ = None
        self.item_featuers_ = None
        
        # required internal data structures
        self.column_index_ = None
        self.model_matrix_ = None
        self.rating_vector_ = None
        
        # model weights
        self.w_constant_ = None
        self.w_features_ = None
        self.w_factors_ = None
        
        
    def initialize_weights(self):
        """initialize model weights after the model matrix has been prepared"""
        
        w_user = np.zeros(len(self.users_))
        w_item = np.zeros(len(self.items_))
        w_user_feature = np.zeros(len(self.user_features_))
        w_item_feature = np.zeros(len(self.item_features_))

        self.w_constant_ = self.rating_vector_.mean()
        self.w_features_ = np.concatenate([w_user, w_item, w_user_feature, w_item_feature])
        self.w_factors_  = np.random.normal(loc=0, scale=self.sigma, size=(len(self.w_features_), self.factors))
        
        
    def prepare_data(self, interactions, user_features, item_features):
        """build the column index, model matrix, and rating vector from components

        :param interactions: dataframe of [user_id, item_id, rating] records 
        :param user_features: dataframe of [user_id, user_feature_1, ... , user_feature_n] records
        :param item_features: dataframe of [item_id, item_feature_1, ... , item_feature_n] records
        :return: self
        """

        # identify the [user_id, item_id, rating] column names
        self.user_id_, self.item_id_, self.rating_ = interactions.columns

        # store the interactions data and assign a row index to use for the model matrix
        interactions_ = interactions.assign(row_index=np.arange(len(interactions)))
        self.rating_vector_ = interactions_[self.rating_].values

        # store unique feature names by namespace 
        self.users_ = np.sort(interactions_[self.user_id_].unique())
        self.items_ = np.sort(interactions_[self.item_id_].unique())
        self.user_features_ = np.sort(user_features.columns[1:])
        self.item_features_ = np.sort(item_features.columns[1:])

        # create namespace-specific column name lists
        user_cols = pd.DataFrame({'namespace': 'users', 'col_name': self.users_})
        item_cols = pd.DataFrame({'namespace': 'items', 'col_name': self.items_})
        user_features_cols = pd.DataFrame({'namespace': 'user_features', 'col_name': self.user_features_})
        item_features_cols = pd.DataFrame({'namespace': 'item_features', 'col_name': self.item_features_})

        # combine the column names from all namespaces and map to model matrix column indexes
        self.column_index_ = pd.concat([user_cols, item_cols, user_features_cols, item_features_cols], axis=0, ignore_index=True)
        self.column_index_['col_index'] = np.arange(len(self.column_index_))

        # separate the column names / indexes by namespace
        user_columns = self.column_index_[self.column_index_['namespace'] == 'users']
        item_columns = self.column_index_[self.column_index_['namespace'] == 'items']
        user_feature_columns = self.column_index_[self.column_index_['namespace'] == 'user_features']
        item_feature_columns = self.column_index_[self.column_index_['namespace'] == 'item_features']

        # reshape (user_features, item_features) long and drop zero-valued features to save space  
        user_features_long = pd.melt(user_features, id_vars=self.user_id_, var_name='feature_name', value_name='value').query('value > 0')
        item_features_long = pd.melt(item_features, id_vars=self.item_id_, var_name='feature_name', value_name='value').query('value > 0')

        # join the (user_features, item_features) with the interactions matrix to get dataframes at the [user_id, item_id, feature_id] level
        user_features_long = pd.merge(interactions_, user_features_long, on=self.user_id_, how='inner')
        item_features_long = pd.merge(interactions_, item_features_long, on=self.item_id_, how='inner')

        # create the component (row, col, val) tuples necessary to build the final sparse model matrix
        user_tuples = pd.merge(interactions_, user_columns, left_on=self.user_id_, right_on='col_name', how='inner')[['row_index', 'col_index']].assign(value=1)
        item_tuples = pd.merge(interactions_, item_columns, left_on=self.item_id_, right_on='col_name', how='inner')[['row_index', 'col_index']].assign(value=1)
        user_features_tuples = pd.merge(user_features_long, user_feature_columns, left_on='feature_name', right_on='col_name', how='inner')[['row_index', 'col_index', 'value']]
        item_features_tuples = pd.merge(item_features_long, item_feature_columns, left_on='feature_name', right_on='col_name', how='inner')[['row_index', 'col_index', 'value']]
        all_tuples = pd.concat([user_tuples, item_tuples, user_features_tuples, item_features_tuples], axis=0, ignore_index=True)

        # create the final sparse model matrix
        mm_input = (all_tuples['value'], (all_tuples['row_index'], all_tuples['col_index']))
        mm_shape = (len(interactions_), len(self.column_index_))
        self.model_matrix_ = csr_matrix(mm_input, shape=mm_shape)

        # calculate model matrix metadata
        interaction_sparsity = round(1 - (self.model_matrix_.shape[0] / (len(self.users_) * len(self.items_))), 4)
        nonzero_entries = self.model_matrix_.count_nonzero()
        storage_size = round(self.model_matrix_.data.nbytes / 1e6, 2)
        print(interaction_sparsity, nonzero_entries, storage_size)

        # initialize model weights and return object reference
        self.initialize_weights()
        return self
    
    
    def predict(self, X):
        """generate predicted ratings given input data rows and current model weights"""

        # create a generator object enumerating the index positions of all second-order interactions
        nonzero_col = X.sum(axis=0).nonzero()[1]
        nonzero_col_pairs = combinations(nonzero_col, 2)
        
        # create a square matrix with all factor dot products
        factor_products = np.dot(self.w_factors_, self.w_factors_.T)

        # calculate the predicted rating as the sum of three terms
        term_1 = self.w_constant_
        term_2 = np.dot(X, self.w_features_).T
        term_3 = np.hstack([factor_products[i,j] * np.multiply(X[:,i], X[:,j]) for i, j in nonzero_col_pairs]).sum(axis=1)

        # final prediction is the sum of all three terms
        predictions = term_1 + term_2 + term_3
        return predictions
    
    
    def loss_value(self, w_constant, w_features, w_factors, X, y):
        """calculate regularized squared error loss"""
    
        # compute MSE loss and regularization penalty
        predictions = self.predict(X)
        mse = jnp.sum(jnp.square(y - predictions))
        penalty = sum([jnp.sum(self.regularization * jnp.square(w)) for w in [w_constant, w_features, w_factors]])

        # return combined loss
        loss = mse + penalty
        return loss

    
    # auto-diff the loss function wrt all weights
    loss_gradient = grad(loss_value, argnums=[1, 2, 3])
    
    
    def create_batches(self, X, y, batch_size):
        """generator function to batch inputs for training"""
    
        # create the beg/end row indices for each batch
        beg_index = range(0, len(y), batch_size)
        end_index = range(batch_size, len(y) + batch_size, batch_size)

        # yield (X, y) tuples for each batch
        for beg, end in zip(beg_index, end_index):
            batch = X[beg:end].todense(), y[beg:end]
            yield batch
            
    
    def fit(self, X, y, batch_size=10):
        """update model weights by mini-batch"""
        
        # create a lazy iterator of (X, y) batches for training
        batches = self.create_batches(X, y, batch_size)
        
        # train model weights one batch at a time
        for i, (batch_x, batch_y) in enumerate(batches):
            # print("current batch: {}".format(i))
            gradients = self.loss_gradient(self.w_constant_, self.w_features_, self.w_factors_, batch_x, batch_y)
            self.w_constant_ -= self.learning_rate * gradients[0]
            self.w_features_ -= self.learning_rate * gradients[1]
            self.w_factors_  -= self.learning_rate * gradients[2]
            
        # return object reference with updated model weights
        return self
        

#### Initialize the Model Object

In [647]:
rankfm = RankFM(factors=10, regularization=0.01, learning_rate=0.01, sigma=0.1)
rankfm

<__main__.RankFM at 0x1c285926d8>

#### Build the Model Matrix and Initialize the Weights

In [648]:
interactions_df = ratings_df[['user_id', 'item_id', 'rating']]
rankfm.prepare_data(interactions_df, users_df, items_df)

0.937 712585 5.7


<__main__.RankFM at 0x1c285926d8>

#### Split Training/Validation Matrices

In [649]:
n_train = 10000
n_valid = 1000

In [650]:
X_train = rankfm.model_matrix_[0:n_train]
y_train = rankfm.rating_vector_[0:n_train]
X_train.shape, y_train.shape

((10000, 2669), (10000,))

In [651]:
X_valid = rankfm.model_matrix_[n_train:(n_train + n_valid)]
y_valid = rankfm.rating_vector_[n_train:(n_train + n_valid)]
X_valid.shape, y_valid.shape

((1000, 2669), (1000,))

#### Perform Batch Updates

In [652]:
loss = rankfm.loss_value(rankfm.w_constant_, rankfm.w_features_, rankfm.w_factors_, X_valid.todense(), y_valid)
print("validation loss: {}".format(loss))
np.sum(np.abs(rankfm.w_factors_))

validation loss: 1360497.875


2125.0487153410636

In [653]:
rankfm.fit(X_train, y_train, batch_size=100)

<__main__.RankFM at 0x1c285926d8>

In [654]:
loss = rankfm.loss_value(rankfm.w_constant_, rankfm.w_features_, rankfm.w_factors_, X_valid.todense(), y_valid)
print("validation loss: {}".format(loss))
np.sum(np.abs(rankfm.w_factors_))

validation loss: 1360831.625


2082.9656

### It's Alive!

# START SANDBOX CODE

In [302]:
rankfm.w_factors_.shape

(2669, 10)

In [303]:
factor_products = np.dot(rankfm.w_factors_, rankfm.w_factors_.T)
factor_products.shape

(2669, 2669)

In [304]:
np.dot(rankfm.w_factors_[2], rankfm.w_factors_[4])

-0.024916870506699763

In [273]:
factor_products[2, 4]

6.5502465e-12

In [188]:
nonzero_col = X.sum(axis=0).nonzero()[1]
nonzero_col_pairs = combinations(nonzero_col, 2)
len(list(nonzero_col_pairs))

23436

In [204]:
nonzero_indexes = X.sum(axis=0).nonzero()[1]

In [205]:
nonzero_indexes

array([   5,    6,    7,    9,   10,   12,   19,   21,   24,   27,   37,
         41,   49,   56,   58,   59,   61,   62,   71,   80,   86,   91,
         94,   96,   98,  101,  114,  118,  121,  126,  134,  137,  144,
        156,  159,  161,  165,  166,  177,  180,  185,  188,  193,  195,
        199,  200,  209,  221,  222,  223,  224,  233,  240,  241,  242,
        243,  245,  248,  250,  252,  253,  259,  266,  275,  277,  278,
        283,  285,  286,  289,  290,  291,  292,  297,  298,  300,  302,
        304,  307,  943,  946,  947,  957,  958,  962,  965,  967,  968,
        971,  974,  982,  993,  996, 1028, 1030, 1037, 1040, 1042, 1053,
       1060, 1085, 1086, 1096, 1107, 1123, 1135, 1136, 1138, 1143, 1151,
       1161, 1164, 1171, 1174, 1176, 1179, 1183, 1184, 1188, 1190, 1199,
       1207, 1216, 1217, 1219, 1230, 1244, 1246, 1264, 1269, 1274, 1280,
       1288, 1308, 1319, 1324, 1326, 1329, 1334, 1345, 1358, 1365, 1369,
       1374, 1393, 1407, 1416, 1422, 1428, 1440, 14

In [207]:
nonzero_index_pairs = combinations(nonzero_indexes, 2)

#### Do Some Diagnostic Checks

In [700]:
column_index.head()

Unnamed: 0,namespace,col_name,col_index
0,const,const,0
1,users,1,1
2,users,2,2
3,users,3,3
4,users,4,4


In [62]:
column_index.col_index.values

array([   0,    1,    2, ..., 2666, 2667, 2668])

In [701]:
column_index.namespace.value_counts()

items            1682
users             943
user_features      26
item_features      18
const               1
Name: namespace, dtype: int64

In [702]:
type(model_matrix), model_matrix.shape

(scipy.sparse.csr.csr_matrix, (100000, 2670))

In [703]:
1 + users.shape[0] + items.shape[0] + (users.shape[1] - 1) + (items.shape[1] - 1)

2670

In [704]:
interactions.head()

Unnamed: 0,user_id,item_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [706]:
rating_vector.head()

Unnamed: 0_level_0,rating
row_index,Unnamed: 1_level_1
0,3
1,3
2,1
3,2
4,1


In [710]:
row_number = 3
nonzero_columns = mm_sample.iloc[row_number, :].T.reset_index()
nonzero_columns[nonzero_columns[row_number] > 0]

Unnamed: 0,col_name,3
0,const,1
244,244,1
994,51,1
2626,agegroup__0,1
2630,gender__M,1
2650,occupation__technician,1
2659,genre__drama,1
2665,genre__romance,1
2668,genre__war,1
2669,genre__western,1


In [711]:
users[users.user_id == 244]

Unnamed: 0,user_id,agegroup__0,agegroup__1,agegroup__2,gender__F,gender__M,occupation__administrator,occupation__artist,occupation__doctor,occupation__educator,...,occupation__marketing,occupation__none,occupation__other,occupation__programmer,occupation__retired,occupation__salesman,occupation__scientist,occupation__student,occupation__technician,occupation__writer
243,244,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [712]:
items[items.item_id == 51]

Unnamed: 0,item_id,genre__action,genre__adventure,genre__animation,genre__childrens,genre__comedy,genre__crime,genre__documentary,genre__drama,genre__fantasy,genre__film_noir,genre__horror,genre__musical,genre__mystery,genre__romance,genre__scifi,genre__thriller,genre__war,genre__western
50,51,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,1


### More Sandbox

In [38]:
X = np.array([[1,2,3],[4,5,6],[7,8,9]])
X

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [48]:
X.sum(axis=0) # .nonzero() # .tolist() # .shape

(array([0, 1, 2]),)

In [150]:
X = rankfm.model_matrix_[:105]
X.shape

(105, 2669)

In [151]:
y = rankfm.rating_vector_[:105]
y.shape

(105,)

In [154]:
def create_batches(X, y, batch_size=10):
    """generator function to batch inputs for training"""
    
    beg_index = range(0, len(y), batch_size)
    end_index = range(batch_size, len(y) + batch_size, batch_size)
    
    for beg, end in zip(beg_index, end_index):
        batch = X[beg:end].todense(), y[beg:end]
        yield batch
    
    

In [155]:
for batch_x, batch_y in create_batches(X, y, batch_size=10):
    print(batch_x.shape, batch_y.shape)
    


(10, 2669) (10,)
(10, 2669) (10,)
(10, 2669) (10,)
(10, 2669) (10,)
(10, 2669) (10,)
(10, 2669) (10,)
(10, 2669) (10,)
(10, 2669) (10,)
(10, 2669) (10,)
(10, 2669) (10,)
(5, 2669) (5,)


In [149]:
X[100:110].todense().shape

(5, 2669)