In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# notebook-level setup, one magic per purpose:
# 1. render matplotlib figures inline, in the cell output
%matplotlib inline
# 2. load the watermark extension; the %watermark call at the end of
#    this cell uses it to print the author, timestamp and package versions
%load_ext watermark
# 3. load autoreload and set it to mode 2, so that the notebook will
#    reload external python modules before executing code
%load_ext autoreload
%autoreload 2

from time import time
from subprocess import call
from epsilon.utils import jit_toy_data

# warm-up call, safe to skip when reading: per the original author's note
# it makes subsequent model evaluation runs faster (presumably by
# triggering JIT compilation on a small toy dataset up front, as the
# name suggests -- TODO confirm against epsilon.utils)
jit_toy_data()

# record who ran the notebook, when, and with which python / package versions
%watermark -a 'Ethen' -d -t -v -p numpy,pandas,matplotlib,epsilon

Ethen 2017-07-24 16:55:59 

CPython 3.5.2
IPython 5.4.1

numpy 1.13.1
pandas 0.20.2
matplotlib 2.0.2
epsilon 0.0.1


In [2]:
# download and unpack the MovieLens 100k archive, unless a directory
# with the extracted data already exists next to the notebook
file_dir = 'ml-100k'
file_path = os.path.join(file_dir, 'u.data')
if not os.path.isdir(file_dir):
    archive = file_dir + '.zip'
    url = 'http://files.grouplens.org/datasets/movielens/' + archive

    # curl flags: -f turns an HTTP error into a non-zero exit status instead
    # of saving the error page as the archive, -L follows redirects,
    # -O stores the download under its remote file name
    for command in (['curl', '-f', '-L', '-O', url], ['unzip', archive]):
        # `call` returns the exit status of the process; stop at the first
        # failing step so the problem is reported here instead of surfacing
        # later as a confusing unzip / read_csv error
        status = call(command)
        if status != 0:
            raise RuntimeError('command failed with exit status {}: {}'
                               .format(status, ' '.join(command)))

In [3]:
from epsilon.utils import check_value_and_coltype


# names for the four tab-separated fields of u.data, in file order;
# read_csv is given these names explicitly, so the first line of
# the file is treated as data rather than as a header
user_col, item_col, rating_col, timestamp_col = 'user_id', 'item_id', 'rating', 'timestamp'
names = [user_col, item_col, rating_col, timestamp_col]

# numeric type that is handed to RecommenderMatrix further down
dtype = 'float32'

# load the ratings, then run the epsilon.utils sanity check on the
# user and item columns (per the original note: it ensures these columns
# contain no nan values and converts them to string type if needed)
df = check_value_and_coltype(
    pd.read_csv(file_path, sep = '\t', names = names),
    user_col, item_col)
print('data dimension: \n', df.shape)
df.head()

data dimension: 
 (100000, 4)


Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [4]:
from datetime import datetime


def extract_time(row):
    """
    Convert a unix timestamp (seconds since the epoch) into a
    'year,month' string, using the UTC calendar date.

    e.g. 881250949 -> '1997,12'
    """
    moment = datetime.utcfromtimestamp(row)
    return '{},{}'.format(moment.year, moment.month)
    

# turn every unix timestamp into its 'year,month' string, split that
# into two columns and cast both back to integers
df_time = (df[timestamp_col]
           .apply(extract_time)
           .str.split(',', expand = True)
           .rename(columns = {0: 'year', 1: 'month'})
           .apply(lambda column: column.apply(int)))

# swap the raw timestamp column for the derived year / month columns
df = pd.concat([df.drop(timestamp_col, axis = 1), df_time], axis = 1)

# mask is the watershed for the train/test data: ratings made in 1998
# from April onwards are held out for testing, the rest is for training
mask = df['year'].eq(1998) & df['month'].ge(4)
df_train, df_test = df[~mask], df[mask]

print('training set dimension: ', df_train.shape)
print('testing set dimension: ', df_test.shape)
df_train.head()

training set dimension:  (90641, 5)
testing set dimension:  (9359, 5)


Unnamed: 0,user_id,item_id,rating,year,month
0,196,242,3,1997,12
2,22,377,1,1997,11
3,244,51,2,1997,11
4,166,346,1,1998,2
5,298,474,4,1998,1


In [5]:
from epsilon.utils import RecommenderMatrix

# build the ratings matrices from the long-format frames: the converter
# is fitted on the training split and the same fitted object is then
# applied to the test split (presumably users are rows and items are
# columns -- TODO confirm in RecommenderMatrix)
rec_matrix = RecommenderMatrix(user_col, item_col, rating_col, dtype)
X_train = rec_matrix.fit_transform(df_train)
X_test = rec_matrix.transform(df_test)
# display the training matrix (the recorded output shows a float32
# sparse matrix in Compressed Sparse Row format)
X_train

<868x1639 sparse matrix of type '<class 'numpy.float32'>'
	with 90640 stored elements in Compressed Sparse Row format>

In [6]:
# display the test matrix; in the recorded output it has the same shape as
# X_train but far fewer stored ratings (2238) than df_test has rows (9359)
# -- presumably ratings whose user or item was not seen during fit are
# dropped by transform; TODO confirm in RecommenderMatrix
X_test

<868x1639 sparse matrix of type '<class 'numpy.float32'>'
	with 2238 stored elements in Compressed Sparse Row format>

In [9]:
from bpr1 import BPR

# hyperparameters forwarded to the BPR constructor as keyword arguments
bpr_params = dict(
    n_factors = 20,
    learning_rate = 0.01,
    n_iters = 12,
    reg = 0.01,
    batch_size = 2000
)

# train on the training matrix only; the value returned
# by fit is what gets displayed as the cell output
bpr = BPR(**bpr_params)
bpr.fit(X_train)



BPR:   0%|          | 0/12 [00:00<?, ?it/s][A[A


BPR:   8%|▊         | 1/12 [00:04<00:54,  4.96s/it][A[A

BPR:  17%|█▋        | 2/12 [00:06<00:39,  3.91s/it][A[A

BPR:  25%|██▌       | 3/12 [00:07<00:28,  3.17s/it][A[A

BPR:  33%|███▎      | 4/12 [00:09<00:21,  2.66s/it][A[A

BPR:  42%|████▏     | 5/12 [00:10<00:16,  2.30s/it][A[A

BPR:  50%|█████     | 6/12 [00:12<00:12,  2.05s/it][A[A

BPR:  58%|█████▊    | 7/12 [00:13<00:09,  1.87s/it][A[A

BPR:  67%|██████▋   | 8/12 [00:15<00:06,  1.74s/it][A[A

BPR:  75%|███████▌  | 9/12 [00:16<00:04,  1.64s/it][A[A

BPR:  83%|████████▎ | 10/12 [00:18<00:03,  1.59s/it][A[A

BPR:  92%|█████████▏| 11/12 [00:19<00:01,  1.56s/it][A[A

BPR: 100%|██████████| 12/12 [00:20<00:00,  1.52s/it][A[A

[A[A

BPR(batch_size=2000, learning_rate=0.01, n_factors=20, n_iters=12,
  random_state=1234, reg=0.01, tensorboard='./graphs/bpr', verbose=True)

In [None]:
from epsilon.metrics import auc_score, ndcg_score, map_score


# AUC of the fitted model, first on the training matrix
# and then on the held-out test matrix
bpr_auc_train, bpr_auc_test = [auc_score(bpr, X) for X in (X_train, X_test)]
print('auc training:', bpr_auc_train)
print('auc testing:', bpr_auc_test)

In [None]:
# NDCG with cutoff k of the fitted model, first on the
# training matrix and then on the held-out test matrix
k = 5
bpr_ndcg_train, bpr_ndcg_test = [ndcg_score(bpr, X, k) for X in (X_train, X_test)]
print('ndcg training:', bpr_ndcg_train)
print('ndcg testing:', bpr_ndcg_test)

In [None]:
# default figure size / font size for this and any later plot
plt.rcParams.update({'figure.figsize': (8, 6), 'font.size': 10})

# plot the history recorded on the fitted model
# (presumably one cost value per training iteration)
fig, ax = plt.subplots()
ax.plot(bpr.history_)
ax.set_title('Convergence Plot')
ax.set_xlabel('Iterations')
ax.set_ylabel('Cost')
plt.show()

In [None]:
# launch TensorBoard on port 8000 against the directory shown as
# tensorboard='./graphs/bpr' in the fitted model's repr.
# NOTE(review): the shell escape waits for the command to finish and
# tensorboard is a server that keeps running, so this cell presumably
# blocks every later cell until the kernel is interrupted -- consider
# running the command from a terminal instead
!tensorboard --logdir='./graphs/bpr/' --port=8000

In [None]:
# (stray scratch input removed: the bare name `hi` is not defined anywhere
# in the notebook, so evaluating it raised NameError and stopped
# "Restart & Run All" before the remaining cells could execute)

In [None]:
# ask the fitted model for similar items with N = 5 (presumably the 5
# most similar items for every item -- TODO confirm in bpr1.BPR)
similar_items = bpr.get_similar_items(N = 5)
similar_items

In [None]:
# ask the fitted model for recommendations with N = 5, passing the
# training matrix (presumably 5 items per user, with X_train used to
# skip items a user already rated -- TODO confirm in bpr1.BPR)
recommendation = bpr.recommend(X_train, N = 5)
recommendation