### Import Required Packages and Set Options

#### Import Base Libraries

In [1]:
import os
import sys

import numpy as np
import pandas as pd
import multiprocessing as mp

import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime
from functools import partial

#### Put the Main Package Library on the PYTHONPATH

In [2]:
curdir = sys.path[0]
srcdir = os.path.join(os.path.split(curdir)[0], 'rankfm')
sys.path[0] = srcdir
srcdir

'/Users/ericlundquist/Repos/rankfm/rankfm'

#### Dynamically Re-Load all Package Modules on Execution

In [3]:
%load_ext autoreload
%autoreload 2

from rankfm import RankFM
from evaluation import precision_at_k, recall_at_k

#### Set File Path Constants

In [4]:
REPO_ROOT = os.path.split(srcdir)[0]
DATA_ROOT = os.path.join(REPO_ROOT, "data/ml-100k")
print("\n".join([REPO_ROOT, DATA_ROOT]))

/Users/ericlundquist/Repos/rankfm
/Users/ericlundquist/Repos/rankfm/data/ml-100k


### Prepare Example Data

#### Load Users Data

In [5]:
users_df = pd.read_csv(os.path.join(DATA_ROOT, "users.csv"))
users_df['agegroup'] = pd.cut(users_df['age'], [0, 30, 45, 100], right=False, labels=False)
users_df = users_df.drop(['age', 'zip_code'], axis=1)
users_df = pd.get_dummies(users_df, prefix_sep='__', columns=['agegroup', 'gender', 'occupation'])
users_df.mean()

user_id                      472.000000
agegroup__0                    0.433722
agegroup__1                    0.348887
agegroup__2                    0.217391
gender__F                      0.289502
gender__M                      0.710498
occupation__administrator      0.083775
occupation__artist             0.029692
occupation__doctor             0.007423
occupation__educator           0.100742
occupation__engineer           0.071050
occupation__entertainment      0.019088
occupation__executive          0.033934
occupation__healthcare         0.016967
occupation__homemaker          0.007423
occupation__lawyer             0.012725
occupation__librarian          0.054083
occupation__marketing          0.027572
occupation__none               0.009544
occupation__other              0.111347
occupation__programmer         0.069989
occupation__retired            0.014846
occupation__salesman           0.012725
occupation__scientist          0.032874
occupation__student            0.207847


#### Load Items Data

In [6]:
items_df = pd.read_csv(os.path.join(DATA_ROOT, "items.csv"))
item_names = items_df[['item_id', 'item_name']]
item_names.head()

Unnamed: 0,item_id,item_name
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [7]:
items_df = items_df.drop(['item_name', 'release_date'], axis=1)
items_df.columns = ['item_id'] + ["genre__{}".format(col) for col in items_df.columns[1:]]
items_df.mean()

item_id               841.500000
genre__action           0.149227
genre__adventure        0.080262
genre__animation        0.024970
genre__childrens        0.072533
genre__comedy           0.300238
genre__crime            0.064804
genre__documentary      0.029727
genre__drama            0.431034
genre__fantasy          0.013080
genre__film_noir        0.014269
genre__horror           0.054697
genre__musical          0.033294
genre__mystery          0.036266
genre__romance          0.146849
genre__scifi            0.060048
genre__thriller         0.149227
genre__war              0.042212
genre__western          0.016052
dtype: float64

#### Load Ratings Data

In [8]:
ratings_explicit = pd.read_csv(os.path.join(DATA_ROOT, "ratings.csv"))
ratings_explicit['timestamp'] = pd.to_datetime(ratings_explicit['unix_timestamp'], origin='unix', unit='s')
ratings_explicit['positive_feedback'] = ratings_explicit.groupby('user_id')['rating'].transform(lambda c: np.where(c > c.mean(), 1, 0))
ratings_explicit = ratings_explicit.drop('unix_timestamp', axis=1)
ratings_explicit.mean()

user_id              462.48475
item_id              425.53013
rating                 3.52986
positive_feedback      0.54194
dtype: float64

In [9]:
ratings_explicit.head()

Unnamed: 0,user_id,item_id,rating,timestamp,positive_feedback
0,196,242,3,1997-12-04 15:55:49,0
1,186,302,3,1998-04-04 19:22:22,0
2,22,377,1,1997-11-07 07:18:36,0
3,244,51,2,1997-11-27 05:02:03,0
4,166,346,1,1998-02-02 05:33:16,0


#### Generate Implicit Feedback Ratings Data

In [10]:
ratings_implicit = ratings_explicit[ratings_explicit.positive_feedback == 1].reset_index(drop=True)
ratings_implicit.head()

Unnamed: 0,user_id,item_id,rating,timestamp,positive_feedback
0,253,465,5,1998-04-03 18:34:27,1
1,286,1014,5,1997-11-17 15:38:45,1
2,200,222,5,1997-10-05 09:05:40,1
3,224,29,3,1998-02-21 23:40:57,1
4,122,387,5,1997-11-11 17:47:39,1


#### Print Final Matrix Shapes

In [11]:
print(ratings_explicit.user_id.nunique(), ratings_explicit.item_id.nunique())
print(ratings_implicit.user_id.nunique(), ratings_implicit.item_id.nunique())

print(ratings_explicit.shape)
print(ratings_implicit.shape)

943 1682
943 1483
(100000, 5)
(54194, 5)


#### Create (Training, Validation) Interactions for Evaluation

In [12]:
print(ratings_implicit.shape)
ratings_implicit.head()

(54194, 5)


Unnamed: 0,user_id,item_id,rating,timestamp,positive_feedback
0,253,465,5,1998-04-03 18:34:27,1
1,286,1014,5,1997-11-17 15:38:45,1
2,200,222,5,1997-10-05 09:05:40,1
3,224,29,3,1998-02-21 23:40:57,1
4,122,387,5,1997-11-11 17:47:39,1


In [13]:
test_pct = 0.25
drop_users = (2, 4)

train_mask = (ratings_implicit.index / ratings_implicit.shape[0]) <  (1 - test_pct)
valid_mask = (ratings_implicit.index / ratings_implicit.shape[0]) >= (1 - test_pct)

interactions_total = ratings_implicit[['user_id', 'item_id']]
interactions_train = ratings_implicit[train_mask][['user_id', 'item_id']]
interactions_valid = ratings_implicit[valid_mask][['user_id', 'item_id']]

# drop some users from the training data to test cold-start functionality
interactions_train = interactions_train[~interactions_train.user_id.isin(drop_users)]

train_users = np.sort(interactions_train.user_id.unique())
valid_users = np.sort(interactions_valid.user_id.unique())
cold_start_users = set(valid_users) - set(train_users)

print("total shape: {}".format(interactions_total.shape))
print("train shape: {}".format(interactions_train.shape))
print("valid shape: {}".format(interactions_valid.shape))
print("train users: {}".format(len(train_users)))
print("valid users: {}".format(len(valid_users)))
print("cold-start users: {}".format(cold_start_users))

total shape: (54194, 2)
train shape: (40598, 2)
valid shape: (13548, 2)
train users: 941
valid users: 905
cold-start users: {2, 4}


In [14]:
iteraction_users = interactions_train.user_id.unique()
iteraction_items = interactions_train.item_id.unique()

#### Create User/Item Features Data for Testing

In [15]:
user_features = users_df[users_df.user_id.isin(iteraction_users)]
print(user_features.shape, users_df.shape)
user_features.head()

(941, 27) (943, 27)


Unnamed: 0,user_id,agegroup__0,agegroup__1,agegroup__2,gender__F,gender__M,occupation__administrator,occupation__artist,occupation__doctor,occupation__educator,...,occupation__marketing,occupation__none,occupation__other,occupation__programmer,occupation__retired,occupation__salesman,occupation__scientist,occupation__student,occupation__technician,occupation__writer
0,1,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,3,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,5,0,1,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
5,6,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,7,0,0,1,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
item_features = items_df[items_df.item_id.isin(iteraction_items)]
print(item_features.shape, items_df.shape)
item_features.head()

(1421, 19) (1682, 19)


Unnamed: 0,item_id,genre__action,genre__adventure,genre__animation,genre__childrens,genre__comedy,genre__crime,genre__documentary,genre__drama,genre__fantasy,genre__film_noir,genre__horror,genre__musical,genre__mystery,genre__romance,genre__scifi,genre__thriller,genre__war,genre__western
0,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


### Test Out Core Package Functionality

#### Initialize the Model

In [17]:
model = RankFM(factors=10, learning_rate=0.1, regularization=0.01, sigma=0.1)
model

<rankfm.RankFM at 0x1a23492cf8>

#### Initialize and Inspect Internal Data

In [18]:
%%time
model._init_all(interactions_train, user_features, item_features)

CPU times: user 594 ms, sys: 31.6 ms, total: 625 ms
Wall time: 666 ms


  d[key] = value


In [19]:
print(len(model.user_id), len(model.user_idx))
print(len(model.item_id), len(model.item_idx))

941 941
1421 1421


In [20]:
print(model.interactions.shape)
print(model.x_uf.shape)
print(model.x_if.shape)

(40598, 2)
(941, 26)
(1421, 18)


In [21]:
print("item weights: {}".format(model.w_i.shape))
print("item feature weights: {}".format(model.w_if.shape))
print("user factors: {}".format(model.v_u.shape))
print("item factors: {}".format(model.v_i.shape))
print("user feature factors: {}".format(model.v_uf.shape))
print("item feature factors: {}".format(model.v_if.shape))

item weights: (1421,)
item feature weights: (18,)
user factors: (941, 10)
item factors: (1421, 10)
user feature factors: (26, 10)
item feature factors: (18, 10)


In [22]:
model.interactions.dtype

dtype('int32')

In [23]:
uf_checker = pd.DataFrame(model.x_uf, columns=user_features.columns[1:])
uf_checker.head()

Unnamed: 0,agegroup__0,agegroup__1,agegroup__2,gender__F,gender__M,occupation__administrator,occupation__artist,occupation__doctor,occupation__educator,occupation__engineer,...,occupation__marketing,occupation__none,occupation__other,occupation__programmer,occupation__retired,occupation__salesman,occupation__scientist,occupation__student,occupation__technician,occupation__writer
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
user_features[user_features.user_id.isin(pd.Series([0,1,2,3,4]).map(model.index_to_user))]

Unnamed: 0,user_id,agegroup__0,agegroup__1,agegroup__2,gender__F,gender__M,occupation__administrator,occupation__artist,occupation__doctor,occupation__educator,...,occupation__marketing,occupation__none,occupation__other,occupation__programmer,occupation__retired,occupation__salesman,occupation__scientist,occupation__student,occupation__technician,occupation__writer
0,1,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,3,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,5,0,1,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
5,6,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,7,0,0,1,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
if_checker = pd.DataFrame(model.x_if, columns=item_features.columns[1:])
if_checker.head()

Unnamed: 0,genre__action,genre__adventure,genre__animation,genre__childrens,genre__comedy,genre__crime,genre__documentary,genre__drama,genre__fantasy,genre__film_noir,genre__horror,genre__musical,genre__mystery,genre__romance,genre__scifi,genre__thriller,genre__war,genre__western
0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [26]:
item_features[item_features.item_id.isin(pd.Series([0,1,2,3,4]).map(model.index_to_item))]

Unnamed: 0,item_id,genre__action,genre__adventure,genre__animation,genre__childrens,genre__comedy,genre__crime,genre__documentary,genre__drama,genre__fantasy,genre__film_noir,genre__horror,genre__musical,genre__mystery,genre__romance,genre__scifi,genre__thriller,genre__war,genre__western
0,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


In [193]:
model = RankFM(factors=10, learning_rate=0.10, regularization=0.01, sigma=0.1)
model

<rankfm.RankFM at 0x1a260e1080>

#### Fit the Model on the Training Data and Profile Computational Performance

In [194]:
%%time

# model.fit(interactions_train, item_features=item_features, epochs=50, verbose=True)
model.fit(interactions_train, epochs=25, verbose=True)


training epoch: 0
log likelihood: -19068.44

training epoch: 1
log likelihood: -15767.97

training epoch: 2
log likelihood: -15047.42

training epoch: 3
log likelihood: -14474.57

training epoch: 4
log likelihood: -13686.82

training epoch: 5
log likelihood: -12655.79

training epoch: 6
log likelihood: -11825.23

training epoch: 7
log likelihood: -10929.16

training epoch: 8
log likelihood: -10346.99

training epoch: 9
log likelihood: -9807.65

training epoch: 10
log likelihood: -9337.15

training epoch: 11
log likelihood: -9205.61

training epoch: 12
log likelihood: -8868.27

training epoch: 13
log likelihood: -8701.7

training epoch: 14
log likelihood: -8660.84

training epoch: 15
log likelihood: -8522.38

training epoch: 16
log likelihood: -8356.53

training epoch: 17
log likelihood: -8366.53

training epoch: 18
log likelihood: -8219.62

training epoch: 19
log likelihood: -8152.87

training epoch: 20
log likelihood: -7964.74

training epoch: 21
log likelihood: -7970.16

training ep

In [195]:
# model.fit_partial(interactions_train, epochs=5, verbose=True)

#### Generate Model Scores

In [196]:
interactions_total.shape

(54194, 2)

In [197]:
%%time
scores = model.predict(interactions_total, cold_start='nan') 

CPU times: user 70 ms, sys: 1.9 ms, total: 71.9 ms
Wall time: 71.4 ms


#### Generate TopN Recommendations

In [198]:
all_users = pd.Series(interactions_total.user_id.unique())
print(all_users.shape)
all_users.head(10)

(943,)


0    253
1    286
2    200
3    224
4    122
5    291
6    119
7    167
8    299
9    308
dtype: int64

In [199]:
%%time
top_n = model.recommend_for_users(all_users, n_items=10, filter_previous=False, cold_start='nan')

CPU times: user 496 ms, sys: 4.51 ms, total: 500 ms
Wall time: 126 ms


In [200]:
top_n.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
253,127.0,172.0,181.0,98.0,174.0,50.0,191.0,496.0,79.0,69.0
286,433.0,1.0,747.0,154.0,268.0,189.0,239.0,151.0,216.0,367.0
200,172.0,174.0,181.0,82.0,79.0,265.0,98.0,96.0,50.0,69.0
224,591.0,15.0,196.0,742.0,274.0,97.0,282.0,471.0,28.0,237.0
122,179.0,318.0,196.0,197.0,131.0,70.0,735.0,52.0,172.0,98.0
291,763.0,151.0,11.0,410.0,56.0,1.0,447.0,240.0,403.0,746.0
119,181.0,258.0,50.0,282.0,111.0,845.0,591.0,121.0,1.0,25.0
167,127.0,197.0,191.0,483.0,131.0,199.0,527.0,498.0,657.0,529.0
299,127.0,170.0,514.0,483.0,213.0,191.0,100.0,479.0,209.0,179.0
308,209.0,603.0,185.0,482.0,175.0,558.0,56.0,447.0,191.0,179.0


#### Evaluate Model Performance on the Validation Data

In [201]:
interactions_valid.head()

Unnamed: 0,user_id,item_id
40646,437,443
40647,712,1037
40648,903,1070
40649,151,629
40650,711,79


In [202]:
k = 10

In [203]:
%%time
model_pre = round(precision_at_k(model, interactions_valid, k=k), 3)
model_rec = round(recall_at_k(model, interactions_valid, k=k), 3)

CPU times: user 2.2 s, sys: 7.94 ms, total: 2.2 s
Wall time: 659 ms


In [204]:
print("model precision: {} model recall: {}".format(model_pre, model_rec))

model precision: 0.123 model recall: 0.134


#### Evaluate Pure Popularity Baseline

In [205]:
most_popular = interactions_train.groupby('item_id').size().sort_values(ascending=False)[:k]
most_popular

item_id
50     357
100    291
181    277
258    260
98     249
174    246
127    245
1      225
56     223
313    221
dtype: int64

In [206]:
test_user_items = interactions_valid.groupby('user_id')['item_id'].apply(set).to_dict()
test_recommends = model.recommend_for_users(list(test_user_items.keys()), cold_start='drop')
test_user_items = {key: val for key, val in test_user_items.items() if key in test_recommends.index}

base_pre = round(np.mean([len(set(most_popular.index) & set(val)) / len(set(most_popular.index)) for key, val in test_user_items.items()]), 3)
base_rec = round(np.mean([len(set(most_popular.index) & set(val)) / len(set(val))                for key, val in test_user_items.items()]), 3)

print("number of test users: {}".format(len(test_user_items)))
print("baseline precision: {} baseline recall: {}".format(base_pre, base_rec))

number of test users: 903
baseline precision: 0.098 baseline recall: 0.088


#### Generate Model Scores for Test User/Items

In [180]:
mask = (interactions_valid.user_id.isin(interactions_train.user_id)) & (interactions_valid.item_id.isin(interactions_train.item_id))
print(interactions_valid.shape)
print(interactions_valid[mask].shape)

(13548, 2)
(13469, 2)


In [181]:
scores_miss = model.predict(interactions_valid, cold_start='nan')
scores_drop = model.predict(interactions_valid, cold_start='drop')

print(scores_miss.shape)
print(scores_drop.shape)

(13548,)
(13469,)


In [182]:
print(scores_miss.mean(), scores_miss.std(), scores_miss.min(), scores_miss.max())
print(scores_drop.mean(), scores_drop.std(), scores_drop.min(), scores_drop.max())

nan nan nan nan
0.67234236 0.60539776 -1.4645457 1.7895787


#### Spot-Check Some User Recommendations

In [183]:
len(train_users), len(valid_users), cold_start_users

(941, 905, {2, 4})

In [184]:
recommendations = model.recommend_for_users(valid_users, n_items=10, filter_previous=True, cold_start='nan')
print(recommendations.shape)
recommendations.head(10)

(905, 10)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
1,172.0,427.0,69.0,318.0,204.0,8.0,357.0,655.0,483.0,97.0
2,,,,,,,,,,
3,333.0,288.0,302.0,294.0,310.0,258.0,292.0,326.0,286.0,272.0
4,,,,,,,,,,
5,56.0,174.0,210.0,195.0,22.0,82.0,64.0,69.0,181.0,196.0
6,172.0,654.0,8.0,79.0,603.0,182.0,69.0,210.0,179.0,496.0
7,50.0,69.0,318.0,173.0,82.0,12.0,357.0,202.0,1.0,197.0
8,98.0,1.0,64.0,100.0,69.0,204.0,121.0,168.0,186.0,7.0
9,100.0,302.0,56.0,258.0,127.0,98.0,172.0,475.0,427.0,174.0
10,172.0,427.0,98.0,168.0,174.0,474.0,64.0,175.0,195.0,79.0


In [185]:
recommendations[recommendations.isnull().any(axis=1)]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
2,,,,,,,,,,
4,,,,,,,,,,


In [191]:
user_id = 5
user_recs = recommendations.loc[user_id]
user_item_names = item_names[item_names.item_id.isin(user_recs)].set_index('item_id').loc[user_recs]
user_item_names

Unnamed: 0_level_0,item_name
item_id,Unnamed: 1_level_1
56.0,Pulp Fiction (1994)
174.0,Raiders of the Lost Ark (1981)
210.0,Indiana Jones and the Last Crusade (1989)
195.0,"Terminator, The (1984)"
22.0,Braveheart (1995)
82.0,Jurassic Park (1993)
64.0,"Shawshank Redemption, The (1994)"
69.0,Forrest Gump (1994)
181.0,Return of the Jedi (1983)
196.0,Dead Poets Society (1989)


#### Look at Similar Movies for a Few Recommended Movies

In [192]:
most_similar_items = model.similar_items(56.0)
most_similar_names = item_names.set_index('item_id').loc[most_similar_items]
most_similar_names

Unnamed: 0_level_0,item_name
item_id,Unnamed: 1_level_1
168,Monty Python and the Holy Grail (1974)
172,"Empire Strikes Back, The (1980)"
195,"Terminator, The (1984)"
82,Jurassic Park (1993)
64,"Shawshank Redemption, The (1994)"
202,Groundhog Day (1993)
204,Back to the Future (1985)
568,Speed (1994)
208,Young Frankenstein (1974)
496,It's a Wonderful Life (1946)


### Look at the Movies with the Highest/Lowest Model Weights

In [53]:
item_weights = pd.DataFrame({'item_idx': np.arange(len(model.w_i)), 'w_item': model.w_i})
item_weights['item_id'] = item_weights['item_idx'].map(model.index_to_item)
item_weights = pd.merge(item_weights, item_names, on='item_id').sort_values('w_item', ascending=False)
item_weights = item_weights[['item_id', 'item_idx', 'item_name', 'w_item']]

#### Best Movies Ever?

In [54]:
item_weights.head(10)

Unnamed: 0,item_id,item_idx,item_name,w_item
49,50,49,Star Wars (1977),2.312693
99,100,99,Fargo (1996),2.213715
124,127,124,"Godfather, The (1972)",2.178087
310,313,310,Titanic (1997),2.112038
255,258,255,Contact (1997),2.064679
283,286,283,"English Patient, The (1996)",2.05086
178,181,178,Return of the Jedi (1983),1.937805
97,98,97,"Silence of the Lambs, The (1991)",1.867586
21,22,21,Braveheart (1995),1.824096
171,174,171,Raiders of the Lost Ark (1981),1.785299


#### Worst Movies Ever?

In [55]:
item_weights.tail(10)

Unnamed: 0,item_id,item_idx,item_name,w_item
1295,1413,1295,Street Fighter (1994),-1.712822
536,548,536,"NeverEnding Story III, The (1994)",-1.715896
1169,1227,1169,"Awfully Big Adventure, An (1995)",-1.716949
442,453,442,Jaws 3-D (1983),-1.719923
1244,1324,1244,Loaded (1994),-1.728409
958,996,958,"Big Green, The (1995)",-1.729017
1383,1554,1383,Safe Passage (1994),-1.731241
1323,1458,1323,"Damsel in Distress, A (1937)",-1.744731
1261,1370,1261,I Can't Sleep (J'ai pas sommeil) (1994),-1.746733
1321,1452,1321,Lady of Burlesque (1943),-1.780282


# Start Sandbox Code

In [189]:
item_names[:20]

Unnamed: 0,item_id,item_name
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
5,6,Shanghai Triad (Yao a yao yao dao waipo qiao) ...
6,7,Twelve Monkeys (1995)
7,8,Babe (1995)
8,9,Dead Man Walking (1995)
9,10,Richard III (1995)


In [190]:
def similar_items(item_id, n_items=10):
    """find the items most similar to a given item"""
    
    try:
        item_idx = item_to_index.loc[item_id]
    except (KeyError, TypeError):
        print("item_id={} not found in training data".format(item_id))
    
    lr_item = v_i[item_idx] + np.dot(v_if.T, x_if[item_idx])
    lr_all_items = v_i + np.dot(x_if, v_if)
    
    similarities = pd.Series(np.dot(lr_all_items, lr_item)).drop(item_idx).sort_values(ascending=False)[:n_items]
    most_similar = pd.Series(similarities.index).map(index_to_item)
    return most_similar

    
    
    
    

In [195]:

most_similar

0     201
1     184
2      68
3      53
4    1059
5     559
6     232
7      92
8     240
9     721
dtype: int64

Unnamed: 0_level_0,item_name
item_id,Unnamed: 1_level_1
201,Evil Dead II (1987)
184,Army of Darkness (1993)
68,"Crow, The (1994)"
53,Natural Born Killers (1994)
1059,Don't Be a Menace to South Central While Drink...
559,Interview with the Vampire (1994)
232,Young Guns (1988)
92,True Romance (1993)
240,Beavis and Butt-head Do America (1996)
721,Mallrats (1995)


In [179]:
x_if[i].shape

(18,)

In [131]:
x_if[i]

array([1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1.,
       0.], dtype=float32)

In [159]:

lr_item.shape

(10,)

In [160]:

lr_all_items.shape

(1421, 10)

In [161]:
similarities

808    1.735169
354    1.606644
323    1.549694
743    1.522463
667    1.474427
364    1.455449
237    1.444095
268    1.421634
866    1.408358
351    1.395079
dtype: float32

In [162]:
most_similar_items = pd.Series(similarities.index).map(index_to_item)
most_similar_items

0    833
1    358
2    327
3    763
4    682
5    369
6    240
7    271
8    895
9    355
dtype: int64

In [163]:
item_names[item_names.item_id.isin(most_similar_items)]

Unnamed: 0,item_id,item_name
239,240,Beavis and Butt-head Do America (1996)
270,271,Starship Troopers (1997)
326,327,Cop Land (1997)
354,355,Sphere (1998)
357,358,Spawn (1997)
368,369,Black Sheep (1996)
681,682,I Know What You Did Last Summer (1997)
762,763,Happy Gilmore (1996)
832,833,Bulletproof (1996)
894,895,Scream 2 (1997)


In [94]:
x_if.shape, v_if.shape

((1421, 18), (18, 10))

In [95]:
test = 
test.shape

(1421, 10)

In [96]:
v_i.shape

(1421, 10)

In [167]:
item_to_index.loc[9999]

KeyError: 9999