### Import Required Packages and Set Options

#### Import Base Libraries

In [1]:
import os
import sys

import numpy as np
import pandas as pd
import multiprocessing as mp

import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime
from functools import partial

#### Put the Main Package Directory on `sys.path`

In [2]:
# Locate the package source directory (<parent of the start directory>/rankfm)
# and place it first on the import path so the project modules can be imported.
#
# sys.path[0] is usually the directory the kernel/script was started from, but
# it can be the empty string in interactive sessions and may carry a trailing
# separator; both cases make os.path.split(...)[0] return the wrong parent.
# Normalising to an absolute path first keeps `srcdir` absolute and correct.
# Re-running the cell is harmless: the parent of `srcdir` joined with 'rankfm'
# yields `srcdir` again.
curdir = os.path.abspath(sys.path[0] or os.getcwd())
srcdir = os.path.join(os.path.dirname(curdir), 'rankfm')
sys.path[0] = srcdir
srcdir

'/Users/ericlundquist/Repos/rankfm/rankfm'

#### Dynamically Re-Load all Package Modules on Execution

In [3]:
# Mode 2 of the autoreload extension re-imports all modules before each cell is
# executed, so edits to the package source take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

# Project-local modules; presumably resolved through the `srcdir` entry placed
# on sys.path earlier in the notebook -- confirm.
from rankfm import RankFM
from evaluation import precision_at_k, recall_at_k

#### Set File Path Constants

In [4]:
# The repository root is the parent of the package source directory; the
# ml-100k example files are read from <repo>/data/ml-100k.
REPO_ROOT = os.path.dirname(srcdir)
DATA_ROOT = os.path.join(REPO_ROOT, "data/ml-100k")
print(REPO_ROOT, DATA_ROOT, sep="\n")

/Users/ericlundquist/Repos/rankfm
/Users/ericlundquist/Repos/rankfm/data/ml-100k


### Prepare Example Data

#### Load Users Data

In [5]:
# Read the user table, replace the raw age with a three-level age group code
# (half-open bins [0, 30), [30, 45), [45, 100) -> 0, 1, 2), drop the columns
# that are not used as features, and one-hot encode the categorical columns.
users_raw = pd.read_csv(os.path.join(DATA_ROOT, "users.csv"))
age_codes = pd.cut(users_raw['age'], [0, 30, 45, 100], right=False, labels=False)

users_df = (
    users_raw
    .assign(agegroup=age_codes)
    .drop(['age', 'zip_code'], axis=1)
)
users_df = pd.get_dummies(users_df, prefix_sep='__', columns=['agegroup', 'gender', 'occupation'])

# Column means: for the dummy columns this is the share of users in each level.
users_df.mean()

user_id                      472.000000
agegroup__0                    0.433722
agegroup__1                    0.348887
agegroup__2                    0.217391
gender__F                      0.289502
gender__M                      0.710498
occupation__administrator      0.083775
occupation__artist             0.029692
occupation__doctor             0.007423
occupation__educator           0.100742
occupation__engineer           0.071050
occupation__entertainment      0.019088
occupation__executive          0.033934
occupation__healthcare         0.016967
occupation__homemaker          0.007423
occupation__lawyer             0.012725
occupation__librarian          0.054083
occupation__marketing          0.027572
occupation__none               0.009544
occupation__other              0.111347
occupation__programmer         0.069989
occupation__retired            0.014846
occupation__salesman           0.012725
occupation__scientist          0.032874
occupation__student            0.207847


#### Load Items Data

In [6]:
# Read the item table and keep an (item_id, item_name) lookup for later
# display of recommendations by title.
items_path = os.path.join(DATA_ROOT, "items.csv")
items_df = pd.read_csv(items_path)
item_names = items_df.loc[:, ['item_id', 'item_name']]
item_names.head()

Unnamed: 0,item_id,item_name
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [7]:
# Drop the descriptive columns, then prefix every remaining column after the
# first with "genre__"; the first column is (re)labelled 'item_id' by position.
# NOTE: this cell rebinds `items_df`, so it can only be run once per load.
items_df = items_df.drop(columns=['item_name', 'release_date'])
genre_cols = ["genre__" + col for col in items_df.columns[1:]]
items_df.columns = ['item_id'] + genre_cols
items_df.mean()

item_id               841.500000
genre__action           0.149227
genre__adventure        0.080262
genre__animation        0.024970
genre__childrens        0.072533
genre__comedy           0.300238
genre__crime            0.064804
genre__documentary      0.029727
genre__drama            0.431034
genre__fantasy          0.013080
genre__film_noir        0.014269
genre__horror           0.054697
genre__musical          0.033294
genre__mystery          0.036266
genre__romance          0.146849
genre__scifi            0.060048
genre__thriller         0.149227
genre__war              0.042212
genre__western          0.016052
dtype: float64

#### Load Ratings Data

In [37]:
ratings_explicit = pd.read_csv(os.path.join(DATA_ROOT, "ratings.csv"))

# Epoch seconds -> pandas timestamps.
ratings_explicit['timestamp'] = pd.to_datetime(ratings_explicit['unix_timestamp'], unit='s', origin='unix')

# A rating is positive feedback (1) when it is strictly above that user's own
# mean rating, otherwise 0.
ratings_by_user = ratings_explicit.groupby('user_id')['rating']
ratings_explicit['positive_feedback'] = ratings_by_user.transform(lambda r: (r > r.mean()).astype(int))

ratings_explicit = ratings_explicit.drop(columns=['unix_timestamp'])
ratings_explicit.mean()

user_id              462.48475
item_id              425.53013
rating                 3.52986
positive_feedback      0.54194
dtype: float64

In [38]:
ratings_explicit.head()

Unnamed: 0,user_id,item_id,rating,timestamp,positive_feedback
0,196,242,3,1997-12-04 15:55:49,0
1,186,302,3,1998-04-04 19:22:22,0
2,22,377,1,1997-11-07 07:18:36,0
3,244,51,2,1997-11-27 05:02:03,0
4,166,346,1,1998-02-02 05:33:16,0


#### Generate Implicit Feedback Ratings Data

In [39]:
# Keep only the positive-feedback rows and renumber them 0..n-1; the
# positional index is what the train/validation split below is based on.
is_positive = ratings_explicit['positive_feedback'] == 1
ratings_implicit = ratings_explicit.loc[is_positive].reset_index(drop=True)
ratings_implicit.head()

Unnamed: 0,user_id,item_id,rating,timestamp,positive_feedback
0,253,465,5,1998-04-03 18:34:27,1
1,286,1014,5,1997-11-17 15:38:45,1
2,200,222,5,1997-10-05 09:05:40,1
3,224,29,3,1998-02-21 23:40:57,1
4,122,387,5,1997-11-11 17:47:39,1


#### Print Final Matrix Shapes

In [40]:
# First the number of distinct users and items in each table, then the
# (rows, columns) shape of each table -- explicit before implicit both times.
for frame in (ratings_explicit, ratings_implicit):
    print(frame['user_id'].nunique(), frame['item_id'].nunique())

for frame in (ratings_explicit, ratings_implicit):
    print(frame.shape)

943 1682
943 1483
(100000, 5)
(54194, 5)


#### Create (Training, Validation) Interactions for Evaluation

In [41]:
print(ratings_implicit.shape)
ratings_implicit.head()

(54194, 5)


Unnamed: 0,user_id,item_id,rating,timestamp,positive_feedback
0,253,465,5,1998-04-03 18:34:27,1
1,286,1014,5,1997-11-17 15:38:45,1
2,200,222,5,1997-10-05 09:05:40,1
3,224,29,3,1998-02-21 23:40:57,1
4,122,387,5,1997-11-11 17:47:39,1


In [42]:
test_pct = 0.25

# Split by row position: each row's index divided by the table length gives
# its relative position, and rows below the (1 - test_pct) cutoff go to
# training while the rest go to validation. No shuffling is applied.
row_fraction = ratings_implicit.index / len(ratings_implicit)
train_cutoff = 1 - test_pct

train_mask = row_fraction < train_cutoff
valid_mask = row_fraction >= train_cutoff

id_cols = ['user_id', 'item_id']
interactions_train = ratings_implicit.loc[train_mask, id_cols]
interactions_valid = ratings_implicit.loc[valid_mask, id_cols]

print("train shape: {}".format(interactions_train.shape))
print("valid shape: {}".format(interactions_valid.shape))

train shape: (40646, 2)
valid shape: (13548, 2)


### Test Out Core Package Functionality

#### Initialize the Model

In [17]:
# Hyperparameters for this example run, gathered in one place for easy tuning.
hyperparams = dict(factors=10, learning_rate=0.1, regularization=0.01, sigma=0.1)
model = RankFM(**hyperparams)
model

<rankfm.RankFM at 0x1a163cecc0>

#### Fit the Model on the Training Data and Profile Computational Performance

In [18]:
%%prun -l 10 

# Train under the profiler; `-l 10` restricts the printed report to 10 entries.
model.fit(interactions_train, epochs=10, verbose=True)


training epoch: 0
penalized log-likelihood: -15659.78

training epoch: 1
penalized log-likelihood: -14657.63

training epoch: 2
penalized log-likelihood: -13993.72

training epoch: 3
penalized log-likelihood: -12926.64

training epoch: 4
penalized log-likelihood: -11544.56

training epoch: 5
penalized log-likelihood: -10646.9

training epoch: 6
penalized log-likelihood: -9839.49

training epoch: 7
penalized log-likelihood: -9117.44

training epoch: 8
penalized log-likelihood: -8480.68

training epoch: 9
penalized log-likelihood: -8051.57
 

         2786735 function calls (2785466 primitive calls) in 26.624 seconds

   Ordered by: internal time
   List reduced from 760 to 10 due to restriction <10>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
     1344   12.659    0.009   12.659    0.009 {method 'acquire' of '_thread.lock' objects}
   406460    8.952    0.000   10.289    0.000 rankfm.py:128(_gradient_step)
   812920    1.682    0.000    2.441    0.000 rankfm.py:116(_pairwise_utility)
   406470    1.393    0.000    2.497    0.000 rankfm.py:165(<genexpr>)
   812920    0.759    0.000    0.759    0.000 {built-in method numpy.dot}
        1    0.289    0.289   26.620   26.620 rankfm.py:170(fit)
      360    0.138    0.000    0.138    0.000 {built-in method posix.waitpid}
       10    0.135    0.014    0.138    0.014 pool.py:397(_map_async)
       31    0.100    0.003    2.597    0.084 {built-in method builtins.sum}
       80    0.087    0.001    0.112    0.001 {built-in method posix.fork}

#### Evaluate Model Performance on the Validation Data

In [46]:
interactions_valid.head()

Unnamed: 0,user_id,item_id
40646,437,443
40647,712,1037
40648,903,1070
40649,151,629
40650,711,79


In [47]:
# Cutoff passed to precision_at_k / recall_at_k and used as the length of the
# most-popular baseline list in the cells that follow.
k = 10

In [48]:
# Score the fitted model on the held-out interactions, rounded to 3 decimals
# for display.
precision_raw = precision_at_k(model, interactions_valid, k=k)
recall_raw = recall_at_k(model, interactions_valid, k=k)
model_pre, model_rec = round(precision_raw, 3), round(recall_raw, 3)
print("model precision: {} model recall: {}".format(model_pre, model_rec))

model precision: 0.119 model recall: 0.129


#### Evaluate Pure Popularity Baseline

In [49]:
# Count training interactions per item and keep the k most frequent items.
item_counts = interactions_train.groupby('item_id').size()
most_popular = item_counts.sort_values(ascending=False).iloc[:k]
most_popular

item_id
50     359
100    292
181    277
258    261
98     249
174    246
127    246
1      226
56     223
313    222
dtype: int64

In [50]:
# Held-out items per validation user, restricted to the users for whom the
# model returns recommendations, so the baseline is scored on the same users.
test_user_items = interactions_valid.groupby('user_id')['item_id'].apply(set).to_dict()
test_recommends = model.recommend_for_users(list(test_user_items.keys()))
test_user_items = {user: items for user, items in test_user_items.items() if user in test_recommends.index}

# Every user receives the same recommendation list: the most popular items.
popular_items = set(most_popular.index)
hits_per_user = [len(popular_items & set(items)) for items in test_user_items.values()]
held_out_sizes = [len(set(items)) for items in test_user_items.values()]

# Precision: hits / size of the recommended list; recall: hits / held-out items.
base_pre = round(np.mean([hits / len(popular_items) for hits in hits_per_user]), 3)
base_rec = round(np.mean([hits / size for hits, size in zip(hits_per_user, held_out_sizes)]), 3)

print("number of test users: {}".format(len(test_user_items)))
print("baseline precision: {} baseline recall: {}".format(base_pre, base_rec))

number of test users: 905
baseline precision: 0.098 baseline recall: 0.087


#### Spot-Check Some User Recommendations

In [56]:
# Ask the model for 10 items for every evaluated validation user and preview
# the first 10 users (one row per user).
test_users = list(test_user_items)
recommendations = model.recommend_for_users(test_users, n_items=10)
recommendations.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
1,50,56,183,179,127,195,12,168,55,98
2,258,286,300,313,315,269,268,301,328,306
3,690,328,292,333,315,313,300,306,301,258
4,286,269,258,268,300,315,313,333,328,301
5,210,183,195,11,588,204,403,746,96,156
6,191,170,357,483,474,127,511,199,479,427
7,183,195,210,177,423,432,511,185,22,96
8,210,183,22,423,50,204,127,174,172,195
9,127,50,100,286,56,98,275,12,7,269
10,511,191,474,479,357,483,170,179,654,23


In [57]:
# Show the titles of one user's recommended items, in recommendation order.
user_id = 6
user_recs = recommendations.loc[user_id]

# Label-based lookup on item_id returns the rows in the order of `user_recs`.
names_by_item = item_names.set_index('item_id')
user_item_names = names_by_item.loc[user_recs]
user_item_names

Unnamed: 0_level_0,item_name
item_id,Unnamed: 1_level_1
191,Amadeus (1984)
170,Cinema Paradiso (1988)
357,One Flew Over the Cuckoo's Nest (1975)
483,Casablanca (1942)
474,Dr. Strangelove or: How I Learned to Stop Worr...
127,"Godfather, The (1972)"
511,Lawrence of Arabia (1962)
199,"Bridge on the River Kwai, The (1957)"
479,Vertigo (1958)
427,To Kill a Mockingbird (1962)


### Look at the Movies with the Highest/Lowest Model Weights

In [58]:
# One row per model item index with its weight from `model.w_item`; map the
# index back to the original item_id, attach the title, and sort so the
# largest weights come first.
item_weights = (
    pd.DataFrame({'item_idx': np.arange(len(model.w_item)), 'w_item': model.w_item})
    .assign(item_id=lambda frame: frame['item_idx'].map(model.index_to_item))
    .merge(item_names, on='item_id')
    .sort_values('w_item', ascending=False)
    .loc[:, ['item_id', 'item_idx', 'item_name', 'w_item']]
)

#### Best Movies Ever?

In [59]:
item_weights.head(10)

Unnamed: 0,item_id,item_idx,item_name,w_item
49,50,49,Star Wars (1977),2.62524
124,127,124,"Godfather, The (1972)",2.505399
99,100,99,Fargo (1996),2.338529
6,7,6,Twelve Monkeys (1995),2.241892
255,258,255,Contact (1997),2.203237
11,12,11,"Usual Suspects, The (1995)",2.170979
55,56,55,Pulp Fiction (1994),2.166616
21,22,21,Braveheart (1995),2.166275
283,286,283,"English Patient, The (1996)",2.163986
97,98,97,"Silence of the Lambs, The (1991)",2.140684


#### Worst Movies Ever?

In [60]:
item_weights.tail(10)

Unnamed: 0,item_id,item_idx,item_name,w_item
1412,1639,1412,Bitter Sugar (Azucar Amargo) (1996),-1.735323
1293,1410,1293,Harlem (1993),-1.748674
435,446,435,Burnt Offerings (1976),-1.749218
73,74,73,Faster Pussycat! Kill! Kill! (1965),-1.74933
685,700,685,Miami Rhapsody (1995),-1.750904
1408,1629,1408,Nico Icon (1995),-1.754974
1374,1536,1374,Aiqing wansui (1994),-1.757247
1195,1260,1195,Total Eclipse (1995),-1.761253
954,992,954,Head Above Water (1996),-1.771836
1416,1649,1416,"Big One, The (1997)",-1.777709


# Start Sandbox Code