In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Notebook setup magics, in the order they appear below:
# 1. magic for inline plots
# 2. load the watermark extension, used at the end of this cell to print versions
# 3. magic so that the notebook will reload external Python modules
# 4. an IPython magic to enable retina (high resolution) plots
#    https://gist.github.com/minrk/3301035
# Keep comments on their own lines: a line magic receives the rest of its
# line as its argument string.
%matplotlib inline
%load_ext watermark
%load_ext autoreload
%autoreload 2
%config InlineBackend.figure_format = 'retina'

from time import time
from subprocess import call
from epsilon.utils import jit_toy_data

# no need to worry about this part; it makes
# subsequent model evaluation runs faster
# NOTE(review): presumably warms up JIT-compiled code on toy data —
# confirm in epsilon.utils
jit_toy_data()

# print author, date, time, Python/IPython versions and the versions
# of the listed packages
%watermark -a 'Ethen' -d -t -v -p numpy,pandas,matplotlib,epsilon

Ethen 2017-08-14 09:09:15 

CPython 3.5.2
IPython 5.4.1

numpy 1.13.1
pandas 0.20.2
matplotlib 2.0.2
epsilon 0.0.1


In [2]:
# download and extract the MovieLens 100k data if the extracted
# directory is not already present in the working directory
file_dir = 'ml-100k'
file_path = os.path.join(file_dir, 'u.data')
if not os.path.isdir(file_dir):
    zip_name = file_dir + '.zip'
    url = 'http://files.grouplens.org/datasets/movielens/' + zip_name

    # -f: exit non-zero on an HTTP error instead of saving the error page
    #     under the zip's name
    # -L: follow redirects
    # -O: save under the remote file name
    if call(['curl', '-f', '-L', '-O', url]) != 0:
        raise RuntimeError('failed to download ' + url)

    # stop here if extraction fails, rather than letting a later
    # read of file_path fail with a less helpful error
    if call(['unzip', zip_name]) != 0:
        raise RuntimeError('failed to extract ' + zip_name)

In [3]:
# NOTE(review): in the recorded run (epsilon 0.0.1 per the watermark output)
# this import raised "ImportError: cannot import name 'check_value_and_coltype'",
# and every later cell shows no execution count — confirm which module of the
# installed epsilon version exposes this helper.
from epsilon.utils import check_value_and_coltype


# names of the user, item, rating and timestamp columns;
# later cells refer to the columns through these variables
user_col = 'user_id'
item_col = 'item_id'
rating_col = 'rating'
timestamp_col = 'timestamp'

# the file is read as tab-separated and the column names are supplied explicitly
names = [user_col, item_col, rating_col, timestamp_col]
df = pd.read_csv(file_path, sep = '\t', names = names)

# this is simply a quick utility function to ensure
# the columns do not include nan values and to convert
# their type to string if they are not strings already
# NOTE(review): described from the original author's comment; the helper's
# implementation is not visible here — confirm how it treats nan values
df = check_value_and_coltype(df, user_col, item_col)
print('data dimension: \n', df.shape)
df.head()

ImportError: cannot import name 'check_value_and_coltype'

In [None]:
from datetime import datetime


def extract_time(row):
    """
    Convert a Unix timestamp (seconds since the epoch) into a
    'year,month' string, using the UTC calendar date.

    Parameters
    ----------
    row : int or float
        Unix timestamp in seconds.

    Returns
    -------
    str
        The year and the (non zero-padded) month joined by a comma,
        e.g. '1998,4'.
    """
    moment = datetime.utcfromtimestamp(row)
    return '{},{}'.format(moment.year, moment.month)
    

# derive integer 'year' and 'month' columns from the raw timestamps.
# `df` itself is left untouched (no column is overwritten or dropped in
# place), so this cell can be re-run without a KeyError on the timestamp column
df_time = (df[timestamp_col]
           .apply(extract_time)
           .str.split(',', expand = True)
           .rename(columns = {0: 'year', 1: 'month'})
           .astype(int))

# ratings with the timestamp column replaced by year/month
df_dated = pd.concat([df.drop(timestamp_col, axis = 1), df_time], axis = 1)

# mask is the watershed for the train/test data:
# everything from April 1998 onwards goes to the test set
mask = (df_dated['year'] == 1998) & (df_dated['month'] >= 4)
df_train = df_dated[~mask]
df_test = df_dated[mask]

print('training set dimension: ', df_train.shape)
print('testing set dimension: ', df_test.shape)
df_train.head()

In [None]:
from epsilon.utils import RecommenderMatrix


# settings handed to the matrix converter
# NOTE(review): presumably ratings are compared against rating_threshold to
# decide what counts as positive feedback — confirm in RecommenderMatrix
dtype = 'float32'
rating_threshold = 4

rec_matrix = RecommenderMatrix(
    user_col,
    item_col,
    rating_col,
    rating_threshold = rating_threshold,
    dtype = dtype
)

# fit the converter on the training frame only, then reuse the
# fitted converter to transform the test frame
X_train = rec_matrix.fit_transform(df_train)
X_test = rec_matrix.transform(df_test)
X_train

In [None]:
# bare last expression: the notebook shows X_test's repr as the cell output
X_test

In [None]:
from epsilon.models import TensorflowBPR

# hyperparameters, unpacked as keyword arguments into TensorflowBPR
bpr_params = dict(
    n_factors = 10,
    learning_rate = 0.1,
    n_iters = 150,
    reg = 0.01,
    batch_size = 200
)

# train on the training matrix; the return value of fit is the cell output
bpr = TensorflowBPR(**bpr_params)
bpr.fit(X_train)

In [None]:
# second model: the same hyperparameter values as the first BPR run,
# plus a 'tensorboard' entry
bpr_params = dict(
    n_factors = 10,
    learning_rate = 0.1,
    n_iters = 150,
    reg = 0.01,
    batch_size = 200
)
# NOTE(review): the path has no './' prefix, so it names a hidden '.graphs'
# directory — confirm it matches the --logdir used when launching tensorboard
bpr_params['tensorboard'] = '.graphs/bpr1'

bpr1 = TensorflowBPR(**bpr_params)
bpr1.fit(X_train)

In [None]:
from epsilon.metrics import auc_score, ndcg_score, map_score


# score the fitted BPR model on both splits (train first, then test)
bpr_auc_train, bpr_auc_test = (
    auc_score(bpr, X) for X in (X_train, X_test)
)
print('auc training:', bpr_auc_train)
print('auc testing:', bpr_auc_test)

In [None]:
# k is passed as the third argument to ndcg_score
# NOTE(review): presumably the ranking cutoff (NDCG@k) — confirm in epsilon.metrics
k = 5
bpr_ndcg_train, bpr_ndcg_test = (
    ndcg_score(bpr, X, k) for X in (X_train, X_test)
)
print('ndcg training:', bpr_ndcg_train)
print('ndcg testing:', bpr_ndcg_test)

In [None]:
# global plot defaults (these also apply to any figure created later)
plt.rcParams.update({
    'figure.figsize': (8, 6),
    'font.size': 10
})

# plot the values the fitted model stored in bpr.history_
fig, ax = plt.subplots()
ax.plot(bpr.history_)
ax.set_title('Convergence Plot')
ax.set_xlabel('Iterations')
ax.set_ylabel('Cost')
plt.show()

In [None]:
from lightfm import LightFM
from joblib import cpu_count
from lightfm.evaluation import auc_score as auc_score1

model = LightFM(learning_rate = 0.01, loss = 'bpr')

# time only the training step
start = time()
model.fit(X_train, epochs = 15, num_threads = cpu_count())
elapse = time() - start

# the AUCs must exist before the print below uses them; previously they
# were only defined in a later cell, so a fresh Run All hit a NameError here.
# auc_score1 returns an array of scores, hence the mean
train_auc = auc_score1(model, X_train).mean()
test_auc = auc_score1(model, X_test).mean()

print('elapse time:', elapse)
print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))

In [None]:
# mean AUC of the LightFM model on each split (train first, then test)
train_auc, test_auc = (
    auc_score1(model, X).mean() for X in (X_train, X_test)
)

In [None]:
# (stray scratch input `hi` removed: the name is undefined, so the cell
# raised NameError and stopped a Restart & Run All at this point)

In [None]:
!tensorboard --logdir='./graphs/bpr/' --port=8000

In [None]:
# ask the fitted BPR model for similar items and display the result
# NOTE(review): N = 5 is presumably the number of similar items returned
# per item — confirm in TensorflowBPR.get_similar_items
similar_items = bpr.get_similar_items(N = 5)
similar_items

In [None]:
# ask the fitted BPR model for recommendations based on the training matrix
# and display the result
# NOTE(review): N = 5 is presumably the number of recommendations per user —
# confirm in TensorflowBPR.recommend
recommendation = bpr.recommend(X_train, N = 5)
recommendation