In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# IPython magics, in the order they appear below:
# 1. render matplotlib figures inline in the notebook
# 2. load the `watermark` extension, used at the end of this cell
#    to print the date and package versions
# 3. load `autoreload` and switch it to mode 2 so that the notebook
#    will reload external python modules when they change
%matplotlib inline
%load_ext watermark
%load_ext autoreload
%autoreload 2

from time import time
from subprocess import call
from epsilon.utils import jit_toy_data

# warm-up call: no need to worry about this part, it makes
# subsequent model evaluation runs faster
# NOTE(review): presumably this triggers JIT compilation of epsilon's
# routines on a toy dataset -- confirm in epsilon.utils
jit_toy_data()

# print the author, date, time, Python/IPython versions and the
# versions of the listed packages
%watermark -a 'Ethen' -d -t -v -p numpy,pandas,matplotlib,epsilon

  from ._conv import register_converters as _register_converters


Ethen 2018-02-25 21:30:44 

CPython 3.6.3
IPython 6.1.0

numpy 1.14.1
pandas 0.22.0
matplotlib 2.1.0
epsilon 0.0.1


In [2]:
# Download and unpack the data if it's not in the same local directory.
# `file_path` points at the ratings file that is read in the next cell.
file_dir = 'ml-100k'
file_path = os.path.join(file_dir, 'u.data')
if not os.path.isdir(file_dir):
    zip_name = file_dir + '.zip'
    url = 'http://files.grouplens.org/datasets/movielens/' + zip_name
    commands = [
        # -f: exit non-zero on HTTP errors instead of saving the error page
        # -L: follow redirects; -O: keep the remote file name
        ['curl', '-f', '-L', '-O', url],
        ['unzip', zip_name],
    ]
    for cmd in commands:
        # `call` returns the exit status; fail here with a clear message
        # rather than later with a confusing "file not found" in read_csv
        if call(cmd) != 0:
            raise RuntimeError('command failed: ' + ' '.join(cmd))

In [3]:
# dtype and column names that are handed to RecommenderMatrix
# further down; the ratings file itself has no header row, so the
# column names are supplied explicitly when reading it
dtype = 'float64'
user_col, item_col, rating_col, timestamp_col = (
    'user_id', 'item_id', 'rating', 'timestamp')

# the file is tab-separated with the four columns in this order
names = [user_col, item_col, rating_col, timestamp_col]
df = pd.read_csv(
    file_path,
    sep = '\t',
    names = names
)

print('data dimension: \n', df.shape)
df.head()

data dimension: 
 (100000, 4)


Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [4]:
from datetime import datetime, timezone

def extract_time(row):
    """
    Convert a unix timestamp into a 'year,month' string.

    Parameters
    ----------
    row : int or float
        Seconds since the unix epoch; interpreted as UTC.

    Returns
    -------
    str
        The UTC year and month joined by a comma, without zero
        padding, e.g. 881250949 -> '1997,12'.
    """
    # timezone-aware replacement for datetime.utcfromtimestamp, which is
    # deprecated in recent Python versions; the year/month are identical
    timestamp = datetime.fromtimestamp(row, tz = timezone.utc)
    # local name chosen so it does not shadow the imported `time` function
    year_month = str(timestamp.year) + ',' + str(timestamp.month)
    return year_month
    

# Replace the raw timestamp column with integer `year` / `month` columns.
# The guard keeps this cell re-runnable: once the timestamp column has
# been swapped out, executing the cell again skips the conversion instead
# of raising a KeyError on the missing column. The input column is also
# no longer overwritten in place before being dropped.
if timestamp_col in df.columns:
    df_time = (df[timestamp_col]
               .apply(extract_time)  # 'year,month' strings
               .str.split(',', expand = True)
               .rename(columns = {0: 'year', 1: 'month'})
               .astype('int64'))
    df = pd.concat([df.drop(timestamp_col, axis = 1), df_time], axis = 1)

# mask is the watershed for the train/test data:
# rows from 1998 with month >= 4 form the test set, the rest is training
mask = (df['year'] == 1998) & (df['month'] >= 4)
df_train = df[~mask]
df_test = df[mask]

print('training set dimension: ', df_train.shape)
print('testing set dimension: ', df_test.shape)
df_train.head()

training set dimension:  (90641, 5)
testing set dimension:  (9359, 5)


Unnamed: 0,user_id,item_id,rating,year,month
0,196,242,3,1997,12
2,22,377,1,1997,11
3,244,51,2,1997,11
4,166,346,1,1998,2
5,298,474,4,1998,1


In [5]:
from epsilon.utils import RecommenderMatrix

# Turn the training ratings into a sparse matrix. The constructor is given
# the names of the user, item and rating columns plus the matrix dtype.
# NOTE(review): `user_threshold = 0` presumably disables filtering out
# users with few ratings -- confirm against epsilon.utils.RecommenderMatrix
rec_matrix = RecommenderMatrix(
    user_col, item_col, rating_col, dtype, user_threshold = 0)
# fit on the training frame only; the recorded output below shows the
# result is a float64 sparse matrix in Compressed Sparse Row format
X_train = rec_matrix.fit_transform(df_train)
X_train

<869x1639 sparse matrix of type '<class 'numpy.float64'>'
	with 90641 stored elements in Compressed Sparse Row format>

In [8]:
# scratch check: logistic sigmoid 1 / (1 + e^(-x)) evaluated at x = 3
1.0 / (1.0 + np.exp(-3))

0.9525741268224334

In [6]:
# scratch check: logistic sigmoid 1 / (1 + e^(-x)) evaluated at x = -3
1.0 / (1.0 + np.exp(3))

0.04742587317756678

In [7]:
# COO representation of the training matrix; its shape gives the
# number of users (rows) and the number of items (columns)
Cui = X_train.tocoo()
n_users, n_items = Cui.shape

# number of latent factors; both factor matrices get one extra
# trailing column on top of these
n_factors = 10

# fixed seed for reproducibility -- the draw order (items first,
# then users) must stay as is to reproduce the same values
rstate = np.random.RandomState(1234)
item_factors = rstate.random_sample((n_items, n_factors + 1))
user_factors = rstate.random_sample((n_users, n_factors + 1))

# pin the extra (last) user column to a constant 1
# NOTE(review): presumably so the matching item column acts as a
# per-item bias term -- confirm against the model code that follows
user_factors[:, -1] = 1.0
