### Import Required Packages and Set Options

#### Import Base Libraries

In [1]:
import os
import sys
import random

import numpy as np
import pandas as pd
import numba as nb

import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime
from functools import partial

#### Put the Main Package Library on the PYTHONPATH

In [2]:
curdir = sys.path[0]
srcdir = os.path.join(os.path.split(curdir)[0], 'rankfm')
sys.path[0] = srcdir
srcdir

'/Users/ericlundquist/Repos/rankfm/rankfm'

#### Re-Load all Package Modules on Execution for Testing

In [3]:
%load_ext autoreload
%autoreload 2

from rankfm import RankFM
from evaluation import precision_at_k, recall_at_k

#### Set File Path Constants

In [4]:
REPO_ROOT = os.path.split(srcdir)[0]
DATA_ROOT = os.path.join(REPO_ROOT, "data/ml-100k")
print("\n".join([REPO_ROOT, DATA_ROOT]))

/Users/ericlundquist/Repos/rankfm
/Users/ericlundquist/Repos/rankfm/data/ml-100k


### Prepare Example Data

#### Load Users Data

In [5]:
users_df = pd.read_csv(os.path.join(DATA_ROOT, "users.csv"))
users_df['agegroup'] = pd.cut(users_df['age'], [0, 30, 45, 100], right=False, labels=False)
users_df = users_df.drop(['age', 'zip_code'], axis=1)
users_df = pd.get_dummies(users_df, prefix_sep='__', columns=['agegroup', 'gender', 'occupation'])

#### Load Items Data

In [6]:
items_df = pd.read_csv(os.path.join(DATA_ROOT, "items.csv"))
item_names = items_df[['item_id', 'item_name']]
item_names.head()

Unnamed: 0,item_id,item_name
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [7]:
items_df = items_df.drop(['item_name', 'release_date'], axis=1)
items_df.columns = ['item_id'] + ["genre__{}".format(col) for col in items_df.columns[1:]]

#### Load Ratings Data

In [8]:
ratings_explicit = pd.read_csv(os.path.join(DATA_ROOT, "ratings.csv"))
ratings_explicit['timestamp'] = pd.to_datetime(ratings_explicit['unix_timestamp'], origin='unix', unit='s')
ratings_explicit['positive_feedback'] = ratings_explicit.groupby('user_id')['rating'].transform(lambda c: np.where(c > c.mean(), 1, 0))
ratings_explicit = ratings_explicit.drop('unix_timestamp', axis=1)
ratings_explicit.mean()

user_id              462.48475
item_id              425.53013
rating                 3.52986
positive_feedback      0.54194
dtype: float64

#### Generate Implicit Feedback Ratings Data

In [9]:
ratings_implicit = ratings_explicit[ratings_explicit.positive_feedback == 1].reset_index(drop=True)
ratings_implicit.head()

Unnamed: 0,user_id,item_id,rating,timestamp,positive_feedback
0,253,465,5,1998-04-03 18:34:27,1
1,286,1014,5,1997-11-17 15:38:45,1
2,200,222,5,1997-10-05 09:05:40,1
3,224,29,3,1998-02-21 23:40:57,1
4,122,387,5,1997-11-11 17:47:39,1


#### Print Final Matrix Shapes

In [10]:
print(ratings_explicit.user_id.nunique(), ratings_explicit.item_id.nunique())
print(ratings_implicit.user_id.nunique(), ratings_implicit.item_id.nunique())

print(ratings_explicit.shape)
print(ratings_implicit.shape)

943 1682
943 1483
(100000, 5)
(54194, 5)


#### Create Simple Interaction Data for Testing

In [11]:
interactions_implicit = ratings_implicit[['user_id', 'item_id']].astype(np.int32)
interactions_implicit.head()

Unnamed: 0,user_id,item_id
0,253,465
1,286,1014
2,200,222
3,224,29
4,122,387


In [12]:
user_features = users_df[users_df.user_id.isin(interactions_implicit.user_id.unique())]
item_features = items_df[items_df.item_id.isin(interactions_implicit.item_id.unique())]

In [13]:
print(interactions_implicit.shape)
print(interactions_implicit.dtypes)

(54194, 2)
user_id    int32
item_id    int32
dtype: object


In [14]:
print(user_features.shape)
print(item_features.shape)

(943, 27)
(1483, 19)


### Train the Model Using Numba

#### Initialize Internal Data Using the Main Modeling Class

In [15]:
rankfm = RankFM()

In [16]:
# %%timeit
rankfm._init_all(interactions_implicit, user_features, item_features)

  d[key] = value


#### Check the Internal Data

In [17]:
type(rankfm.interactions), rankfm.interactions.shape, rankfm.interactions.dtype, round(rankfm.interactions.nbytes / 1e6, 2)

(numpy.ndarray, (54194, 2), dtype('int32'), 0.43)

In [18]:
round(sys.getsizeof(rankfm.user_items_py) / 1e6, 2), round(sys.getsizeof(rankfm.user_items_nb) / 1e6, 2)

(0.04, 0.0)

In [19]:
print(len(rankfm.user_items_py.keys()))
rankfm.user_items_py[1]

943


array([289, 248,  49, 294,  12, 300, 254, 312, 298, 310, 276, 296, 274,
       279, 108, 292, 239, 280, 273,   0,  13, 290, 307, 303,  24, 270,
       308, 266, 252, 281, 234, 297,  99, 124, 282, 301, 269, 283, 272,
       299], dtype=int32)

In [20]:
print(len(rankfm.user_items_nb.keys()))
rankfm.user_items_nb[1]

943


array([289, 248,  49, 294,  12, 300, 254, 312, 298, 310, 276, 296, 274,
       279, 108, 292, 239, 280, 273,   0,  13, 290, 307, 303,  24, 270,
       308, 266, 252, 281, 234, 297,  99, 124, 282, 301, 269, 283, 272,
       299], dtype=int32)

#### Define the Main Internal `_fit()` Function

In [94]:
@nb.njit(cache=True)
def _fit(interactions, item_idx, user_items, x_uf, x_if, w_i, w_if, v_u, v_i, v_uf, v_if, regularization, learning_rate, epochs=1):
    """main internal NJIT fitting function"""
    
    # record user-feature/item-feature/latent-factor dimensions
    # ---------------------------------------------------------
    
    P = x_uf.shape[1]
    Q = x_if.shape[1]
    F = v_i.shape[1]
    
    # define inner functions needed for training 
    # ------------------------------------------
    
    def isin(needle, haystack):
        for i in range(len(haystack)):
            if needle == haystack[i]:
                return True
        return False
    
    # start the main training loop
    # ----------------------------
    
    for epoch in range(epochs):
        
        print("beginning training epoch:", epoch)
        shuffle_index = np.arange(len(interactions))
        np.random.shuffle(shuffle_index)
        interactions = interactions[shuffle_index]
        
        for row in range(len(interactions)):

            u = interactions[row, 0]
            i = interactions[row, 1]
            n_items = len(item_idx)
            
            # sample a random negative item for the user
            # ------------------------------------------

            while True:
                j = int(n_items * random.random())
                if not isin(j, user_items[u]):
                    break
            
            # calculate pairwise utility of (u, i, j) sample
            # ----------------------------------------------
            
            u_item = w_i[i] - w_i[j]
            u_item_features = np.dot(x_if[i] - x_if[j], w_if)
            u_item_user = np.dot(v_i[i] - v_i[j], v_u[u])
            u_user_item_features = np.dot(x_if[i] - x_if[j], np.dot(v_if, v_u[u]))
            u_item_user_features = np.dot(x_uf[u], np.dot(v_uf, v_i[i] - v_i[j]))
            u_feature_interactions = np.dot(np.dot(v_uf.T, x_uf[u]), np.dot(v_if.T, x_if[i] - x_if[j]))
            utility = u_item + u_item_features + u_item_user + u_user_item_features + u_item_user_features + u_feature_interactions
            
            # calculate gradients (d_LL/d_theta) wrt current sample
            # -----------------------------------------------------

            d_con = 1.0 / (np.exp(utility) + 1.0)
            d_reg = 2.0 * regularization

            d_w_i = 1.0
            d_w_j = -1.0
            d_w_if = x_if[i] - x_if[j]

            d_v_u = v_i[i] - v_i[j] + np.dot(v_if.T, x_if[i] - x_if[j])
            d_v_i = v_u[u] + np.dot(v_uf.T, x_uf[u])
            d_v_j = -v_u[u] - np.dot(v_uf.T, x_uf[u])

            d_v_uf = np.empty((P, F), np.float32)
            d_v_if = np.empty((Q, F), np.float32)

            for f in range(F):
                for p in range(P):
                    if (x_uf[u][p]) == 0.0:
                        d_v_uf[p, f] = 0.0
                    else:
                        d_v_uf[p, f] = (x_uf[u][p]) * (v_i[i][f] - v_i[j][f] + np.dot(v_if.T[f], x_if[i] - x_if[j]))
                for q in range(Q):
                    if (x_if[i][q] - x_if[j][q]) == 0.0:
                        d_v_if[q, f] = 0.0
                    else:
                        d_v_if[q, f] = (x_if[i][q] - x_if[j][q]) * (v_u[u][f] + np.dot(v_uf.T[f], x_uf[u]))
                        
            # update model weights based on the calculated gradients
            # ------------------------------------------------------

            w_i[i] += learning_rate * ((d_con * d_w_i)  - (d_reg * w_i[i]))
            w_i[j] += learning_rate * ((d_con * d_w_j)  - (d_reg * w_i[j]))
            w_if   += learning_rate * ((d_con * d_w_if) - (d_reg * w_if))
            v_u[u] += learning_rate * ((d_con * d_v_u)  - (d_reg * v_u[u]))
            v_i[i] += learning_rate * ((d_con * d_v_i)  - (d_reg * v_i[i]))
            v_i[j] += learning_rate * ((d_con * d_v_j)  - (d_reg * v_i[j]))
            v_uf   += learning_rate * ((d_con * d_v_uf) - (d_reg * v_uf))
            v_if   += learning_rate * ((d_con * d_v_if) - (d_reg * v_if))
            
    # return updated model weights
    return w_i, w_if, v_u, v_i, v_uf, v_if
            
    

#### Define All Necessary Training Variables in Local Scope

In [95]:
item_idx = rankfm.item_idx
interactions = rankfm.interactions
user_items = rankfm.user_items_nb

x_uf = rankfm.x_uf
x_if = rankfm.x_if

w_i = rankfm.w_i
w_if = rankfm.w_if

v_u = rankfm.v_u
v_i = rankfm.v_i
v_uf = rankfm.v_uf
v_if = rankfm.v_if

regularization = rankfm.regularization
learning_rate = rankfm.learning_rate

##### all user/item index variables should be **int32**

In [96]:
item_idx.dtype, interactions.dtype, type(list(user_items.keys())[1]), user_items[1].dtype

(dtype('int32'), dtype('int32'), int, dtype('int32'))

##### all user/item features shouldbe **float32**

In [97]:
x_uf.dtype, x_if.dtype

(dtype('float32'), dtype('float32'))

##### all weights/factors should be **float32**

In [98]:
w_i.dtype, w_if.dtype, v_u.dtype, v_i.dtype, v_uf.dtype, v_if.dtype

(dtype('float32'),
 dtype('float32'),
 dtype('float32'),
 dtype('float32'),
 dtype('float32'),
 dtype('float32'))

#### Attempt to Call the Main Fit Function

In [93]:
%%time
w_i, w_if, v_u, v_i, v_uf, v_if = _fit(interactions, item_idx, user_items, x_uf, x_if, w_i, w_if, v_u, v_i, v_uf, v_if, regularization, learning_rate, epochs=10)

beginning training epoch: 0
beginning training epoch: 1
beginning training epoch: 2
beginning training epoch: 3
beginning training epoch: 4
beginning training epoch: 5
beginning training epoch: 6
beginning training epoch: 7
beginning training epoch: 8
beginning training epoch: 9
CPU times: user 16.3 s, sys: 91 ms, total: 16.4 s
Wall time: 16.4 s
