### Import Required Packages and Set Options

#### Import Base Libraries

In [1]:
import os
import sys
import random

import numpy as np
import pandas as pd
import numba as nb

import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime
from functools import partial

#### Put the Main Package Library on the PYTHONPATH

In [2]:
# sys.path[0] is treated as the notebook's own directory
# NOTE(review): confirm this holds for the kernel in use
curdir = sys.path[0]
# the package source is the 'rankfm' folder inside the parent of that directory
srcdir = os.path.join(os.path.split(curdir)[0], 'rankfm')
# overwrite the first search-path entry so the package modules import by bare name
sys.path[0] = srcdir
srcdir

'/Users/ericlundquist/Repos/rankfm/rankfm'

#### Import the Package Modules for Testing

In [3]:
from rankfm import RankFM
from evaluation import precision_at_k, recall_at_k

#### Set File Path Constants

In [4]:
# the repository root is the parent directory of the package source directory
REPO_ROOT = os.path.split(srcdir)[0]
# the data files used below are read from this folder
DATA_ROOT = os.path.join(REPO_ROOT, "data/ml-100k")
print("\n".join([REPO_ROOT, DATA_ROOT]))

/Users/ericlundquist/Repos/rankfm
/Users/ericlundquist/Repos/rankfm/data/ml-100k


### Prepare Example Data

#### Load Users Data

In [5]:
# read the raw user table, bucket age into three left-closed groups
# ([0, 30), [30, 45), [45, 100)), then drop the columns that are not encoded
users_df = (
    pd.read_csv(os.path.join(DATA_ROOT, "users.csv"))
    .assign(agegroup=lambda frame: pd.cut(frame['age'], [0, 30, 45, 100], right=False, labels=False))
    .drop(columns=['age', 'zip_code'])
)

# one-hot encode the categorical user attributes
users_df = pd.get_dummies(
    users_df,
    prefix_sep='__',
    columns=['agegroup', 'gender', 'occupation'],
)
users_df.mean()

user_id                      472.000000
agegroup__0                    0.433722
agegroup__1                    0.348887
agegroup__2                    0.217391
gender__F                      0.289502
gender__M                      0.710498
occupation__administrator      0.083775
occupation__artist             0.029692
occupation__doctor             0.007423
occupation__educator           0.100742
occupation__engineer           0.071050
occupation__entertainment      0.019088
occupation__executive          0.033934
occupation__healthcare         0.016967
occupation__homemaker          0.007423
occupation__lawyer             0.012725
occupation__librarian          0.054083
occupation__marketing          0.027572
occupation__none               0.009544
occupation__other              0.111347
occupation__programmer         0.069989
occupation__retired            0.014846
occupation__salesman           0.012725
occupation__scientist          0.032874
occupation__student            0.207847


#### Load Items Data

In [6]:
items_df = pd.read_csv(os.path.join(DATA_ROOT, "items.csv"))
item_names = items_df[['item_id', 'item_name']]
item_names.head()

Unnamed: 0,item_id,item_name
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [7]:
# drop the descriptive columns; errors='ignore' keeps this cell re-runnable
# after the columns have already been removed by a previous execution
items_df = items_df.drop(['item_name', 'release_date'], axis=1, errors='ignore')

# prefix every genre column exactly once, so a re-run does not double the prefix
items_df.columns = ['item_id'] + [
    col if col.startswith("genre__") else "genre__{}".format(col)
    for col in items_df.columns[1:]
]
items_df.mean()

item_id               841.500000
genre__action           0.149227
genre__adventure        0.080262
genre__animation        0.024970
genre__childrens        0.072533
genre__comedy           0.300238
genre__crime            0.064804
genre__documentary      0.029727
genre__drama            0.431034
genre__fantasy          0.013080
genre__film_noir        0.014269
genre__horror           0.054697
genre__musical          0.033294
genre__mystery          0.036266
genre__romance          0.146849
genre__scifi            0.060048
genre__thriller         0.149227
genre__war              0.042212
genre__western          0.016052
dtype: float64

#### Load Ratings Data

In [8]:
def _above_user_mean(user_ratings):
    """Flag (1/0) each rating that is strictly above the mean of its own group."""
    return np.where(user_ratings > user_ratings.mean(), 1, 0)


ratings_path = os.path.join(DATA_ROOT, "ratings.csv")
ratings_explicit = (
    pd.read_csv(ratings_path)
    .assign(
        # convert the epoch seconds into proper timestamps
        timestamp=lambda frame: pd.to_datetime(frame['unix_timestamp'], origin='unix', unit='s'),
        # a rating counts as positive feedback when it beats that user's own average
        positive_feedback=lambda frame: frame.groupby('user_id')['rating'].transform(_above_user_mean),
    )
    .drop('unix_timestamp', axis=1)
)
ratings_explicit.mean()

user_id              462.48475
item_id              425.53013
rating                 3.52986
positive_feedback      0.54194
dtype: float64

In [9]:
ratings_explicit.head()

Unnamed: 0,user_id,item_id,rating,timestamp,positive_feedback
0,196,242,3,1997-12-04 15:55:49,0
1,186,302,3,1998-04-04 19:22:22,0
2,22,377,1,1997-11-07 07:18:36,0
3,244,51,2,1997-11-27 05:02:03,0
4,166,346,1,1998-02-02 05:33:16,0


#### Generate Implicit Feedback Ratings Data

In [10]:
# keep only the rows flagged as positive feedback and renumber the index from zero
is_positive = ratings_explicit['positive_feedback'] == 1
ratings_implicit = ratings_explicit.loc[is_positive].reset_index(drop=True)
ratings_implicit.head()

Unnamed: 0,user_id,item_id,rating,timestamp,positive_feedback
0,253,465,5,1998-04-03 18:34:27,1
1,286,1014,5,1997-11-17 15:38:45,1
2,200,222,5,1997-10-05 09:05:40,1
3,224,29,3,1998-02-21 23:40:57,1
4,122,387,5,1997-11-11 17:47:39,1


#### Print Final Matrix Shapes

In [11]:
# distinct user / item counts first, then the raw frame shapes
for frame in (ratings_explicit, ratings_implicit):
    print(frame.user_id.nunique(), frame.item_id.nunique())

for frame in (ratings_explicit, ratings_implicit):
    print(frame.shape)

943 1682
943 1483
(100000, 5)
(54194, 5)


#### Create Simple Interaction Data for Testing

In [23]:
interactions = ratings_explicit[['user_id', 'item_id']].astype(np.int32)
interactions.head()

Unnamed: 0,user_id,item_id
0,196,242
1,186,302
2,22,377
3,244,51
4,166,346


In [24]:
print(interactions.shape)
print(interactions.dtypes)

(100000, 2)
user_id    int32
item_id    int32
dtype: object


#### Create Additional Inputs

In [25]:
# np.unique already returns its result in sorted order, so a separate
# np.sort pass over the same values is redundant
all_users = np.unique(interactions.user_id).astype(np.int32)
all_items = np.unique(interactions.item_id).astype(np.int32)
len(all_users), len(all_items)

(943, 1682)

In [26]:
type(all_users), type(all_items)

(numpy.ndarray, numpy.ndarray)

#### Create a Dictionary Lookup of Each User's Items

In [39]:
# group the item ids by user and store each group as an int32 array: {user_id: array of item_ids}
user_items_py = interactions.groupby('user_id')['item_id'].apply(np.array, dtype=np.int32).to_dict()

In [40]:
print(len(user_items_py.keys()))
print(len(user_items_py[2]))
user_items_py[2]

943
62


array([292, 251,  50, 314, 297, 290, 312, 281,  13, 280, 303, 308, 307,
       257, 316, 315, 301, 313, 279, 299, 298,  19, 277, 282, 111, 258,
       295, 242, 283, 276,   1, 305,  14, 287, 291, 293, 294, 310, 309,
       306,  25, 273,  10, 311, 269, 255, 284, 274, 237, 300, 100, 127,
       285, 289, 304, 272, 278, 288, 286, 275, 302, 296], dtype=int32)

In [41]:
type(list(user_items_py.keys())[0]), type(user_items_py[1])

(int, numpy.ndarray)

#### Convert to Numba Dictionary

In [42]:
# typed dictionary with int32 keys and 1-d int32 array values, for passing into the jitted functions below
user_items_nb = nb.typed.Dict.empty(key_type=nb.types.int32, value_type=nb.types.int32[:])

In [43]:
# copy every (user, item-array) pair from the plain Python dict into the typed dict
for key, val in user_items_py.items():
    user_items_nb[key] = val

In [44]:
print(len(user_items_nb.keys()))
print(len(user_items_nb[2]))
user_items_nb[2]

943
62


array([292, 251,  50, 314, 297, 290, 312, 281,  13, 280, 303, 308, 307,
       257, 316, 315, 301, 313, 279, 299, 298,  19, 277, 282, 111, 258,
       295, 242, 283, 276,   1, 305,  14, 287, 291, 293, 294, 310, 309,
       306,  25, 273,  10, 311, 269, 255, 284, 274, 237, 300, 100, 127,
       285, 289, 304, 272, 278, 288, 286, 275, 302, 296], dtype=int32)

In [45]:
type(user_items_nb)

numba.typed.typeddict.Dict

#### Attempt to Re-Write the (u, i, j) Sampling as a Numba Function

In [46]:
# np.asarray accepts both the DataFrame and an already-converted ndarray, so
# re-running this cell no longer fails the way DataFrame.to_numpy() does once
# `interactions` has been rebound to an array
interactions = np.asarray(interactions, dtype=np.int32)
interactions.shape, interactions.dtype

((100000, 2), dtype('int32'))

In [47]:
all_items.shape, all_items.dtype

((1682,), dtype('int32'))

In [48]:
type(user_items_nb)

numba.typed.typeddict.Dict

In [50]:
@nb.njit
def test_fit_1(interactions, items, user_items):
    """Baseline (u, i, j) sampler that rebuilds the candidate set on every row.

    For each row of `interactions`, `u` is column 0 and `i` is column 1; `j` is
    drawn with `np.random.choice` from the elements of `items` that are not in
    `user_items[u]`. Returns the running sum of `u + (i - j)` over all rows.
    """
    
    running_sum = 0
    for r in range(len(interactions)):
        u = interactions[r, 0]
        i = interactions[r, 1]
        # the set difference and the candidate array are rebuilt from scratch for every row
        j = np.random.choice(np.array(list(set(items) - set(user_items[u])), dtype=np.int32))
        running_sum += u + (i - j)
    return running_sum

In [127]:
@nb.njit
def isin(needle, haystack):
    """Linear scan: True as soon as an element of `haystack` equals `needle`, else False."""
    for element in haystack:
        if element == needle:
            return True
    return False

@nb.njit
def test_fit_2(interactions, items, user_items):
    """Draw one (u, i, j) sample per interaction row and return two checksums.

    For every row of `interactions`, `u` is column 0, `i` is column 1, and `j`
    is an element of `items` drawn by rejection sampling until it is not found
    in `user_items[u]`.

    Parameters
    ----------
    interactions : 2-d integer array with one (user, item) pair per row
    items : 1-d array of candidate item ids
    user_items : mapping from a user id to a 1-d array of that user's observed items

    Returns
    -------
    (running_sum_1, running_sum_2) : the sums of `u + (i - j)` and `u + (j - i)`
        over all rows.

    Notes
    -----
    The rejection loop does not terminate for a user whose observed items
    cover every entry of `items`.
    """

    running_sum_1 = 0
    running_sum_2 = 0

    # the number of candidate items does not change between rows
    n_items = len(items)

    for row in range(len(interactions)):

        # identify the user (u) and observed item (i)
        u = interactions[row, 0]
        i = interactions[row, 1]

        # randomly sample an unobserved item (j): draw a random *position* and
        # look up the id stored there -- the position itself is not an item id
        # (ids need not start at zero or be contiguous)
        while True:
            j = items[int(n_items * random.random())]
            if not isin(j, user_items[u]):
                break

        running_sum_1 += u + (i - j)
        running_sum_2 += u + (j - i)

    return running_sum_1, running_sum_2
   

In [128]:
res_1, res_2 = test_fit_2(interactions, all_items, user_items_nb)
res_1, res_2

(-944309, 93441259)

#### Profile the Speed of the Sampling Algorithms

In [129]:
%%timeit
test_fit_1(interactions, all_items, user_items_nb)

2.55 s ± 48.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [130]:
%%timeit
test_fit_2(interactions, all_items, user_items_nb)

18.1 ms ± 470 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


#### Ensure the Correctness of the Sampling Algorithm

In [131]:
def check_isin(needle, haystack):
    """Return True if `needle` equals any element of `haystack`, else False."""
    for i in range(len(haystack)):
        if needle == haystack[i]:
            return True
    return False


def check_test_fit_2(interactions, items, user_items):
    """Pure-Python sampler that returns the (u, i, j) samples for inspection.

    Parameters
    ----------
    interactions : 2-d integer array with one (user, item) pair per row
    items : 1-d array of candidate item ids
    user_items : mapping from a user id to a sequence of that user's observed items

    Returns
    -------
    list of (u, i, j) tuples, one per row of `interactions`, where `j` is an
    element of `items` that is not in `user_items[u]`.

    Notes
    -----
    The rejection loop does not terminate for a user whose observed items
    cover every entry of `items`.
    """

    samples = []

    # the number of candidate items does not change between rows
    n_items = len(items)

    for row in range(len(interactions)):

        # identify the user (u) and observed item (i)
        u = interactions[row, 0]
        i = interactions[row, 1]

        # randomly sample an unobserved item (j): draw a random *position* and
        # look up the id stored there, using the pure-Python membership check
        # so this function does not depend on the jitted helper
        while True:
            j = items[int(n_items * random.random())]
            if not check_isin(j, user_items[u]):
                break

        samples.append((u, i, j))

    return samples

In [151]:
samples = check_test_fit_2(interactions[:100], all_items, user_items_py)

In [152]:
interactions[:5]

array([[196, 242],
       [186, 302],
       [ 22, 377],
       [244,  51],
       [166, 346]], dtype=int32)

In [153]:
# one line per sample: the ids, then whether i / j are observed for u and whether j is a known item
template = "u:{:<4} i:{:<4} j:{:<4} | i in observed: {} | j in observed: {} | j in all items: {}"
for u, i, j in samples:
    observed = user_items_py[u]
    print(template.format(u, i, j, i in observed, j in observed, j in all_items))

u:196  i:242  j:130  | i in observed: True | j in observed: False | j in all items: True
u:186  i:302  j:1494 | i in observed: True | j in observed: False | j in all items: True
u:22   i:377  j:126  | i in observed: True | j in observed: False | j in all items: True
u:244  i:51   j:733  | i in observed: True | j in observed: False | j in all items: True
u:166  i:346  j:1333 | i in observed: True | j in observed: False | j in all items: True
u:298  i:474  j:639  | i in observed: True | j in observed: False | j in all items: True
u:115  i:265  j:1216 | i in observed: True | j in observed: False | j in all items: True
u:253  i:465  j:1321 | i in observed: True | j in observed: False | j in all items: True
u:305  i:451  j:1042 | i in observed: True | j in observed: False | j in all items: True
u:6    i:86   j:910  | i in observed: True | j in observed: False | j in all items: True
u:62   i:257  j:1621 | i in observed: True | j in observed: False | j in all items: True
u:286  i:1014 j:276  