In [None]:
# Notebook: BookRecommender-SVD
# Author: George Tohme
# Date: Sep 24, 2018

In [None]:
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

from scipy.sparse.linalg import svds

from math import sqrt

from IPython.display import SVG, display, HTML

import keras
from keras.models import Model
from keras.layers import Input, Embedding
from keras.layers.core import Flatten, Dense, Dropout
from keras.regularizers import l2
from keras.optimizers import Adam, SGD
from keras.layers.merge import dot, add, concatenate
from keras.constraints import non_neg

from keras.utils.vis_utils import model_to_dot
from keras.utils import plot_model

In [None]:
# Constants
# NOTE(review): the SVD cell calls svds with a hard-coded k = 50 rather than
# this value — confirm which one is intended.
MM_LATENT_FACTORS = 3

TRAIN_EPOCHS = 100
TRAIN_VERBOSE = 1

# Passed as test_size to train_test_split: the fraction of events held out for testing.
TRAIN_SPLIT_RATIO = 0.2

NN_USER_LATENT_FACTORS = 5
NN_MOVIE_LATENT_FACTORS = 8
NN_TRAIN_EPOCHS = 100

# Non-negative matrix factorisation
NNMF = True

SGD_OPTIMIZER = True

# Adam parameters
Adam_lr = 0.0001
Adam_beta_1 = 0.9
Adam_beta_2 = 0.999
Adam_epsilon = None
Adam_decay = 0.0
Adam_amsgrad = False

# SGD parameters
SGD_lr = 0.02
SGD_momentum = 0.0
SGD_decay = 0.0
SGD_nesterov = False

In [None]:
# Ordinal score per impression type: an entry's position in the tuple is its rating (0-5).
impressionRatings = {
    impression: rating
    for rating, impression in enumerate(
        ("dislike", "view", "interact", "like", "add to cart", "checkout")
    )
}

In [None]:
from IPython.display import display_html
def displaySideBySide(*args):
    """Render several DataFrames next to each other in the notebook output.

    Each argument is converted with its ``to_html()`` method, the markup is
    concatenated, and every opening ``<table`` tag receives an inline
    ``display:inline`` style before the HTML is passed to ``display_html``.

    Args:
        *args: objects exposing ``to_html()`` (e.g. pandas DataFrames).
    """
    htmlStr = ''.join(df.to_html() for df in args)
    # Only touch opening tags: replacing the bare word 'table' would also
    # rewrite the closing tags and any cell text that contains "table".
    display_html(htmlStr.replace('<table', '<table style="display:inline"'), raw=True)
#enddef

In [None]:
# Load the three source tables; each CSV file is decoded as latin-1.
usersDataFrame = pd.read_csv('../Data/Users.csv', encoding='latin-1')
booksDataFrame = pd.read_csv('../Data/Books.csv', encoding='latin-1')
eventsDataFrame = pd.read_csv('../Data/UserEvents.csv', encoding='latin-1')

In [None]:
# Row counts of the users, books and events tables, shown as one tuple.
tuple(len(frame.index) for frame in (usersDataFrame, booksDataFrame, eventsDataFrame))

In [None]:
# Work on the first 100000 events only. Taking an explicit copy keeps the
# in-place edits and column assignments in later cells from acting on a slice
# of the loaded frame (the pandas SettingWithCopyWarning case).
eventsDataFrame = eventsDataFrame.iloc[:100000].copy()

In [None]:
# Preview the first rows of the truncated events table.
eventsDataFrame.head()

In [None]:
# Build two lookup tables that give users and books a unique sequential index
def idToIndex(idsList):
    """Map ids to their positions in ``idsList`` and back.

    Returns a pair ``(idToIndexDict, indexToIdDict)``: the first maps
    ``str(id)`` to the id's position in ``idsList``; the second maps
    ``str(position)`` back to the stringified id. If an id occurs more than
    once, its last position is the one that is kept.
    """
    idToIndexDict = {}
    for index, oldId in enumerate(idsList):
        idToIndexDict[str(oldId)] = index
    indexToIdDict = {str(index): idKey for idKey, index in idToIndexDict.items()}
    return idToIndexDict, indexToIdDict
#enddef

In [None]:
# Discard the leading column and rename "user" to "userId".
eventsDataFrame = (
    eventsDataFrame
    .drop(eventsDataFrame.columns[0], axis=1)
    .rename(columns={"user": "userId"})
)
# Append one indicator column per distinct impression value (impression_<value>).
eventsDataFrame = pd.concat(
    [eventsDataFrame, pd.get_dummies(eventsDataFrame['impression'], prefix='impression')],
    axis=1,
)

In [None]:
# Give every distinct user id and book id a sequential index, keeping both the
# forward (id -> index) and the reverse (index -> id) lookup.
userIdToIndex, userIndexToId = idToIndex(list(eventsDataFrame['userId'].unique()))
bookIdToIndex, bookIndexToId = idToIndex(list(eventsDataFrame['bookId'].unique()))

# Attach the sequential indices as two new columns; ids are stringified because
# the lookup dictionaries are keyed by str(id).
eventsDataFrame['userIDX'] = eventsDataFrame['userId'].map(lambda rawId: userIdToIndex[str(rawId)])
eventsDataFrame['bookIDX'] = eventsDataFrame['bookId'].map(lambda rawId: bookIdToIndex[str(rawId)])

In [None]:
# Convert the impression to a rating
# Impressions that are not keys of impressionRatings become NaN (Series.map with a dict).
eventsDataFrame["impressionRating"] = eventsDataFrame["impression"].map(impressionRatings)
# Drops the first nine columns by position, in place.
# NOTE(review): which columns these are depends on the CSV layout and on how many
# impression_* dummy columns were generated — confirm. Later cells still read
# 'userId', 'bookId' and 'impression'; verify that they survive this drop.
# Re-running this cell is not safe: the positional drop is applied again to
# whatever columns remain.
eventsDataFrame.drop(eventsDataFrame.columns[[0,1,2,3,4,5,6,7,8]], axis=1, inplace=True)

In [None]:
# Preview the first rows of the users table.
usersDataFrame.head()

In [None]:
# Preview the first rows of the books table.
booksDataFrame.head()

In [None]:
# Preview the events table after the index/rating columns were added and the
# leading columns were dropped.
eventsDataFrame.head()

In [None]:
# Inspect the crosstab of the top users/books ratings
# NOTE(review): this cell reads the 'userId', 'bookId' and 'impression' columns;
# verify they still exist after the positional column drop in the earlier cell.

# Number of events per user, keeping the 20 users with the most events.
g = eventsDataFrame.groupby('userId')['impression'].count()
topUsers=g.sort_values(ascending=False)[:20]

# Number of events per book, keeping the 20 books with the most events.
g = eventsDataFrame.groupby('bookId')['impression'].count()
topBooks=g.sort_values(ascending=False)[:20]

# Inner joins keep only the events whose user and book are both in the top 20.
# NOTE(review): both joined Series are named 'impression' and both joins use
# rsuffix='_r', so the second join yields a second 'impression_r' column; newer
# pandas versions may reject such duplicate columns — confirm.
topRatings = eventsDataFrame.join(topUsers, rsuffix='_r', how='inner', on='userId')
topRatings = topRatings.join(topBooks, rsuffix='_r', how='inner', on='bookId')

# NOTE(review): the aggregated values are the raw 'impression' column (presumably
# strings) combined with np.sum; 'impressionRating' looks like the intended
# numeric column — confirm.
pd.crosstab(topRatings.userId, topRatings.bookId, topRatings.impression, aggfunc=np.sum)

In [None]:
# dig into the data...
# Distinct users and books, and the share of user x book cells without an event row.
numOfUsers = eventsDataFrame['userIDX'].nunique(dropna=False)
numOfBooks = eventsDataFrame['bookIDX'].nunique(dropna=False)
dataSparcity = 100.0 - (100.0 * eventsDataFrame.shape[0] / (numOfUsers * numOfBooks))

print("Number of users: " + str(numOfUsers))
print("Number of books: " + str(numOfBooks))
print("Matrix sparcity: {0:2.4f}%".format(dataSparcity))

In [None]:
# Hold out TRAIN_SPLIT_RATIO of the events for testing. A fixed random_state
# makes the split, and every metric derived from it, reproducible across runs.
trainDataFrame, testDataFrame = train_test_split(eventsDataFrame, test_size=TRAIN_SPLIT_RATIO, random_state=42)
#actualImpressions = testDataFrame[["impression_checkout", "impression_dislike", "impression_interact", "impression_like", "impression_view"]]
# Ground-truth ratings of the held-out events.
actualImpressionRatings = testDataFrame["impressionRating"]

In [None]:
# Preview the first rows of the training split.
trainDataFrame.head()

In [None]:
# Preview the first rows of the held-out test split.
testDataFrame.head()

In [None]:
# Preview the impressionRating values of the test split.
actualImpressionRatings.head()

#### ---------Matrix Factorization (SVD) Implementation--------------

In [None]:
# Inspect the crosstab of the users/books ratings
# Unlike the earlier top-20 cell, no [:20] limit is applied here: every user and
# every book is kept, so the crosstab spans all users x all books.
# NOTE(review): this cell reads the 'userId', 'bookId' and 'impression' columns;
# verify they still exist after the positional column drop in the earlier cell.

# Number of rated events per user, most active first.
g = eventsDataFrame.groupby('userId')['impressionRating'].count()
topUsers=g.sort_values(ascending=False)

# Number of rated events per book, most active first.
g = eventsDataFrame.groupby('bookId')['impressionRating'].count()
topBooks=g.sort_values(ascending=False)

# NOTE(review): both joined Series are named 'impressionRating' and both joins
# use rsuffix='_r', so the second join yields a second 'impressionRating_r'
# column; newer pandas versions may reject such duplicate columns — confirm.
topRatings = eventsDataFrame.join(topUsers, rsuffix='_r', how='inner', on='userId')
topRatings = topRatings.join(topBooks, rsuffix='_r', how='inner', on='bookId')

# NOTE(review): the aggregated values are the raw 'impression' column combined
# with np.sum; 'impressionRating' looks like the intended numeric column — confirm.
pd.crosstab(topRatings.userId, topRatings.bookId, topRatings.impression, aggfunc=np.sum)

In [None]:
# Build the user x book rating matrix: rows are userIDX, columns are bookIDX,
# cells hold impressionRating, and missing (user, book) pairs are filled with 0.
# NOTE(review): 0 is also the rating of "dislike" in impressionRatings, so unseen
# and disliked books are indistinguishable here — confirm this is intended.
# NOTE(review): DataFrame.pivot raises ValueError when a (userIDX, bookIDX) pair
# occurs more than once; verify the events contain no repeated pairs.
R_df = eventsDataFrame.pivot(index = 'userIDX', columns ='bookIDX', values = 'impressionRating').fillna(0)
R_df.head()



In [None]:

# DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same ndarray
# and is available in old and new pandas versions alike.
R = R_df.values
# Centre each user's row on that user's mean rating (the mean is taken over the
# whole row, so the zero-filled cells are included).
user_ratings_mean = np.mean(R, axis = 1)
R_demeaned = R - user_ratings_mean.reshape(-1, 1)

# Truncated SVD keeping 50 singular triplets; svds needs k < min(R_demeaned.shape).
U, sigma, Vt = svds(R_demeaned, k = 50)

# svds returns the singular values as a 1-D array; expand them to a diagonal matrix.
sigma = np.diag(sigma)

#### ---------END NOTEBOOK--------------