In [1]:
# Notebook: BookRecommender
# Author: George Tohme
# Date: Sep 26, 2018

In [2]:
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

from scipy.sparse.linalg import svds

from math import sqrt

from IPython.display import SVG, display, HTML
from keras.utils.vis_utils import model_to_dot
from keras.utils import plot_model

import pydot

import keras

from keras.models import Model
from keras.layers import Input, Embedding
from keras.layers.core import Flatten, Dense, Dropout
from keras.layers.merge import dot, add, concatenate

from keras.regularizers import l2
from keras.optimizers import Adam, SGD
from keras.constraints import non_neg

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [77]:
# Constants
# All tunable values for the notebook live in this cell.

# SVDS_k: number of singular values/vectors requested from scipy's svds().
# MAX_RECOMMENDATIONS: how many books to recommend per user.
SVDS_k = 5
MAX_RECOMMENDATIONS = 10

# Settings for the Keras matrix-multiplication model (Model 1).
# MM_LATENT_FACTORS is also the embedding width used by Models 2 and 3.
MM_LATENT_FACTORS = 3
MM_BATCH_SIZE = 64
MM_TRAIN_EPOCHS = 50

# TRAIN_VERBOSE is forwarded to Keras fit(verbose=...).
# TRAIN_SPLIT_RATIO is passed as `test_size` to train_test_split, i.e. despite
# its name it is the fraction of rows held out for TESTING.
TRAIN_VERBOSE = 1
TRAIN_SPLIT_RATIO = 0.2

# Settings for the neural-net models.
# NOTE: NN_MOVIE_LATENT_FACTORS is the embedding width for BOOKS (the name is
# a leftover; it is the value given to the 'BooksEmbedding' layer).
NN_USER_LATENT_FACTORS = 5
NN_MOVIE_LATENT_FACTORS = 8
NN_TRAIN_EPOCHS = 50
NN_BATCH_SIZE = 64

# Non-negative Matrix factorisation
# When True, the user/book embedding layers are built with a non_neg() constraint.
NNMF = True

# True -> train with SGD (parameters below); False -> train with Adam.
SGD_OPTIMIZER = False

# Adam Parameters
# Passed positionally, in this order, to keras.optimizers.Adam(...).
Adam_lr=0.02
Adam_beta_1=0.9
Adam_beta_2=0.999
Adam_epsilon=None
Adam_decay=0.0
Adam_amsgrad=False

#SGD Parameters
# Passed positionally, in this order, to keras.optimizers.SGD(...).
SGD_lr=0.02
SGD_momentum=0.0
SGD_decay=0.0
SGD_nesterov=False

### General notes
- The quality of the provided data ...
- New user predictions: 'cold start problem'
- Not the most elegant code: inefficient use of memory (lots of pandas DataFrames), and the DataFrame joining/merging is a bit convoluted.


### Approach
- Since we don't have actual ratings of books by users but rather the interaction with the books,
I opted to treat the 6 types of interactions as ....

In [4]:
# Translate the events action to ratings. 
# That's the approach taken in this Notebook.

# Another idea is to one-hot encode the actions and then 
# treat the problem as a classification problem. 
# Have not tried this to see if it yields better predictions.

# Interaction types listed from weakest to strongest signal; each one is
# paired with its position (0..5), which serves as its numeric rating.
impressionRatings = dict(zip(
    ("dislike", "view", "interact", "like", "add to cart", "checkout"),
    range(6),
))

In [5]:
# A function to display tables & plots side-by-side

from IPython.display import display_html
def displaySideBySide(*args):
    """Render several tables next to each other in a single output cell.

    Parameters
    ----------
    *args
        Objects exposing a ``to_html()`` method (e.g. pandas DataFrames).

    The HTML of all arguments is concatenated and every opening ``<table``
    tag is given an inline display style so the tables are laid out on one
    row. Only the opening tag is rewritten: replacing the bare word
    ``table`` would also alter the closing ``</table>`` tags and any cell
    text that happens to contain the word "table".

    Returns nothing; the result is shown via ``display_html``.
    """
    htmlStr = ''.join(df.to_html() for df in args)
    display_html(htmlStr.replace('<table', '<table style="display:inline"'), raw=True)
#enddef

#### Read and show the data...

In [6]:
usersDataFrame = pd.read_csv('../Data/Users.csv', encoding = 'latin-1')
booksDataFrame = pd.read_csv('../Data/Books.csv', encoding = 'latin-1')
eventsDataFrame = pd.read_csv('../Data/UserEvents.csv', encoding = 'latin-1')

In [7]:
# Preliminary dimensions
len(usersDataFrame.index), len(booksDataFrame.index), len(eventsDataFrame.index)

(100000, 149998, 400000)

In [8]:
usersDataFrame.head()

Unnamed: 0.1,Unnamed: 0,user,location,age
0,177121,177122.0,"winter park, florida, usa",50.0
1,174021,174022.0,"fairview park, ohio, usa",
2,179441,179442.0,"farmville, north carolina, usa",45.0
3,44391,44392.0,"weston, florida, usa",65.0
4,89250,89251.0,"duesseldorf, n/a, germany",


In [9]:
booksDataFrame.head()

Unnamed: 0.1,Unnamed: 0,bookISBN,bookName,author,yearOfPublication,publisher,urlId
0,58005,773730982,"Stretch, Swallow &amp; Stare",Veronika Martenova Charles,1999,Stoddart Kids,8040.0
1,158211,916620867,The Two Faces of Religion: A Psychiatrists View,N.S. Xavier,1987,Portals Pr,46584.0
2,127812,380978598,Roma Eterna,Robert Silverberg,2003,Eos,39681.0
3,47598,802727719,For Everything a Season: Simple Musings on Liv...,Philip Gulley,2001,Walker Large Print,29478.0
4,131723,446604178,"Mountain, Get Out of My Way: Life Lessons and ...",Montel Williams,1997,Warner Books (Mm),104573.0


In [10]:
eventsDataFrame.head()

Unnamed: 0.1,Unnamed: 0,user,bookId,impression
0,523113,126736,843946806,dislike
1,861298,208406,345353145,like
2,37104,8890,2020213508,add to cart
3,328497,78553,451402383,add to cart
4,121368,27875,307129659,add to cart


#### Preliminary cleanup...

In [11]:
# Rename the user column for consistency
usersDataFrame.rename(columns={"user": "userId"}, inplace = True)

In [12]:
# Intended to rename the books' id column for consistency with the events table.
# NOTE(review): the booksDataFrame.head() output above already shows a
# 'bookISBN' column and no 'bookId' column, so this rename appears to be a
# no-op (pandas ignores missing keys in `columns=` by default) - confirm.
booksDataFrame.rename(columns={"bookId": "bookISBN"}, inplace = True)

In [13]:
# Some housekeeping column names updates for consistency and clarity...
eventsDataFrame.rename(columns={"user": "userId"}, inplace = True)
eventsDataFrame.rename(columns={"bookId": "bookISBN"}, inplace = True)

In [14]:
# Convert the impression to a rating (think sales funnel ratings)
eventsDataFrame["impressionRating"] = eventsDataFrame["impression"].map(impressionRatings)

In [15]:
# Remove the old text impression column and the event row id column.
# The columns are dropped by name rather than by position: positional drops
# remove *different* columns every time the cell is re-run (userId would be
# next). errors="ignore" makes a second run of this cell a harmless no-op.
eventsDataFrame.drop(columns=["Unnamed: 0", "impression"], errors="ignore", inplace=True)

#### Merge the dataframes and remove the nulls...

In [16]:
# Join the events DF to the books DF using the books' ISBN.
# We're doing this, to ensure we're counting the books and users 
# that exist in the other tables (files)

# Building the unique indices from the full events file would
# yield incorrect data and cause the models to crash, because some ids would
# be larger than the maximum count.

# Inner join: keep only the events whose ISBN exists in the books table and
# bring each matching book's author and publication year along.
filledEventsDataFrame = eventsDataFrame.merge(
    booksDataFrame[['author', 'yearOfPublication', 'bookISBN']],
    how='inner',
    on='bookISBN',
)

In [17]:
# Remove all nulls...
filledEventsDataFrame = filledEventsDataFrame.dropna()

#### Create unique ids for some columns...

In [18]:
# Give every distinct author name a dense integer id, numbered in order of
# first appearance in the merged events table.
authors = filledEventsDataFrame['author'].unique()
authorsDict = dict(zip(authors, range(len(authors))))

# Attach that id to the books table (authors absent from the events get NaN)...
booksDataFrame["authorIDX"] = booksDataFrame["author"].map(authorsDict)
# ...and to the merged events table.
filledEventsDataFrame["authorIDX"] = filledEventsDataFrame["author"].map(authorsDict)

In [19]:
# Create a dense integer book id (bookIDX) from the distinct ISBNs that
# appear in the merged events dataset, numbered in order of first appearance.
bookIds = filledEventsDataFrame["bookISBN"].unique()
bookIdsDict = {bookId: bookIDX for bookIDX, bookId in enumerate(bookIds)}

# Add it to the books dataframe. Books whose ISBN is not a key of
# bookIdsDict (i.e. never seen in the events) get NaN here.
booksDataFrame["bookIDX"] = booksDataFrame["bookISBN"].map(bookIdsDict)

# and to the merged events dataframe
filledEventsDataFrame["bookIDX"] = filledEventsDataFrame["bookISBN"].map(bookIdsDict)

In [20]:
# Dense integer user id (userIDX): one per distinct userId found in the
# merged events, numbered in order of first appearance.
userIds = filledEventsDataFrame["userId"].unique()
userIdsDict = dict(zip(userIds, range(len(userIds))))

# Users table: users without any event are left as NaN by the mapping.
usersDataFrame["userIDX"] = usersDataFrame["userId"].map(userIdsDict)

# Merged events table.
filledEventsDataFrame["userIDX"] = filledEventsDataFrame["userId"].map(userIdsDict)

In [21]:
filledEventsDataFrame.head()

Unnamed: 0,userId,bookISBN,impressionRating,author,yearOfPublication,authorIDX,bookIDX,userIDX
0,850,3426616262,2,Alexander Solschenizyn,1999,0,0,0
1,177458,1565112318,2,Neil Gaiman,1997,1,1,1
2,151420,8445071416,4,J. R. R. Tolkien,1991,2,2,2
3,229501,8445071416,3,J. R. R. Tolkien,1991,2,2,3
4,93755,8445071416,4,J. R. R. Tolkien,1991,2,2,4


In [22]:
filledEventsDataFrame.shape

(41048, 8)

#### Ensure there are no nulls in the dataframe we will be using and save it (just in case) for offline inspection...

In [23]:
filledEventsDataFrame[filledEventsDataFrame.isnull().any(axis=1)]

Unnamed: 0,userId,bookISBN,impressionRating,author,yearOfPublication,authorIDX,bookIDX,userIDX


In [24]:
filledEventsDataFrame.to_csv("filledEventsDataFrame.csv")

#### Split the data into training and test sets...

In [25]:
# Do the split...
# scikit-learn's splitter is imported under an explicit alias because a later
# cell imports a function with the same name from the surprise library; after
# that import, re-running this cell would otherwise call the wrong function.
from sklearn.model_selection import train_test_split as sklearn_train_test_split

# A fixed random_state makes the partition (and every metric computed from
# it) reproducible across kernel restarts.
trainDataFrame, testDataFrame = sklearn_train_test_split(filledEventsDataFrame,
                                                         test_size=TRAIN_SPLIT_RATIO,
                                                         random_state=42)

# Grab the actual impression to check the results later
actualImpressionRatings = testDataFrame["impressionRating"]

In [26]:
# dig into the data...
numOfUsers = len(filledEventsDataFrame['userIDX'].unique())
numOfBooks = len(filledEventsDataFrame['bookIDX'].unique())
dataSparcity = 100.0 - (100.0 * len(filledEventsDataFrame) / (numOfUsers * numOfBooks))

print(filledEventsDataFrame.shape)
print("Number of users: " + str(numOfUsers))
print("Number of books: " + str(numOfBooks))
print("Matrix sparcity: {0:2.4f}%".format(dataSparcity))

(41048, 8)
Number of users: 17252
Number of books: 21128
Matrix sparcity: 99.9887%


#### Show a crosstab of the data...

In [27]:
# Inspect the crosstab of the top users/books events (ratings)

# Number of events per user; keep the 20 users with the most events.
g = filledEventsDataFrame.groupby('userIDX')['impressionRating'].count()
topUsers = g.sort_values(ascending=False)[:20]

# Same for books (the temporary name `g` is reused for the per-book counts).
g = filledEventsDataFrame.groupby('bookIDX')['impressionRating'].count()
topBooks = g.sort_values(ascending=False)[:20]

# Inner-join the events against both top-20 Series (matched on userIDX, then
# on bookIDX) so that only events by a top user on a top book remain.
# rsuffix is applied to the count column coming from the right-hand Series,
# whose name clashes with the existing 'impressionRating' column.
topRatings = filledEventsDataFrame.join(topUsers, how='inner', rsuffix='r_', on='userIDX')
topRatings = topRatings.join(topBooks, how='inner', rsuffix='r_', on='bookIDX')

# User x book grid holding the summed ratings for each pair.
pd.crosstab(topRatings.userIDX, topRatings.bookIDX, topRatings.impressionRating, aggfunc=np.sum)

bookIDX,30,34,41,46,60,63,87,156,171,206,292,420,472,518,774,869
userIDX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
21,3.0,,0.0,4.0,5.0,1.0,5.0,,,5.0,,3.0,3.0,,5.0,5.0
107,4.0,,,,5.0,,,5.0,,,,,,,,
260,4.0,,,,,,,4.0,,,,,,,,
414,,1.0,,2.0,,,,,,,,,,,,
520,,,1.0,,,,,,,,,,,,,
938,,,,,5.0,5.0,,,,,,,,,,
1156,,,,,5.0,,,5.0,5.0,,,5.0,,,4.0,5.0
1244,,,,,,2.0,,,,,5.0,,,1.0,,
1348,,,,,,,,,,,,,,,,4.0
3401,,,,,,,,,,,,3.0,,,,


#### Show the heads...

In [28]:
trainDataFrame.head()

Unnamed: 0,userId,bookISBN,impressionRating,author,yearOfPublication,authorIDX,bookIDX,userIDX
26960,203820,8437607000,2,Fernando De Rojas,1990,6651,9027,108
4400,243930,067153615X,0,Larry McMurtry,1995,393,416,2071
41587,78761,3717515640,2,Edgar Allan Poe,1999,39,20533,2417
42030,150917,3821805870,3,Walter KrÌ?å_mer,1998,14103,20952,14466
14821,224944,067172570X,2,Spike Lee,1990,2440,2990,8888


In [29]:
trainDataFrame.shape

(32838, 8)

In [30]:
testDataFrame.head()

Unnamed: 0,userId,bookISBN,impressionRating,author,yearOfPublication,authorIDX,bookIDX,userIDX
16272,274808,059047877X,2,Caroline B. Cooney,1994,2880,3583,6610
40262,63714,1840240075,5,Michael Sheard,1997,12524,19294,1377
34162,211919,1582970009,2,L. Peat O'Neil,2000,9922,14087,8078
27333,11676,1551661764,3,Tess Gerritsen,1996,225,9234,21
32118,97324,1586420054,3,ALBERTO MORAVIA,2000,8925,12520,2791


In [31]:
testDataFrame.shape

(8210, 8)

In [32]:
actualImpressionRatings.head()

16272    2
40262    5
34162    2
27333    3
32118    3
Name: impressionRating, dtype: int64

In [33]:
actualImpressionRatings.shape

(8210,)

In [34]:
filledEventsDataFrame.head()

Unnamed: 0,userId,bookISBN,impressionRating,author,yearOfPublication,authorIDX,bookIDX,userIDX
0,850,3426616262,2,Alexander Solschenizyn,1999,0,0,0
1,177458,1565112318,2,Neil Gaiman,1997,1,1,1
2,151420,8445071416,4,J. R. R. Tolkien,1991,2,2,2
3,229501,8445071416,3,J. R. R. Tolkien,1991,2,2,3
4,93755,8445071416,4,J. R. R. Tolkien,1991,2,2,4


# The models

#### ---------Matrix Multiplication - SVD Implementation (No Optimization)--------------



In [35]:
# Build the full ratings dataframe: one row per userIDX, one column per
# bookIDX, each cell holding that pair's impressionRating.
# NOTE(review): pairs without any event are filled with 0, which is also the
# rating that impressionRatings assigns to "dislike" - so "never interacted"
# and "disliked" are indistinguishable in this matrix.
R_DataFrame = filledEventsDataFrame.pivot(index = 'userIDX', 
                                          columns ='bookIDX', 
                                          values = 'impressionRating').fillna(0)

In [36]:
R_DataFrame.head()

bookIDX,0,1,2,3,4,5,6,7,8,9,...,21118,21119,21120,21121,21122,21123,21124,21125,21126,21127
userIDX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
R_DataFrame.shape

(17252, 21128)

In [53]:
# Convert it to a matrix (array)
R_Matrix = R_DataFrame.values

In [67]:
R_Matrix

array([[2., 0., 0., ..., 0., 0., 0.],
       [0., 2., 0., ..., 0., 0., 0.],
       [0., 0., 4., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [57]:
# TODO: Optimize  'k'

# Ideally, we could optimize svds by creating a training and validation set to find the best 'k'
# by automatically minimizing the resulting RMSE.

In [58]:
# TODO: Assess adding a bias

In [59]:
# In the interest of time, this is not done here. Instead, I checked the outside temperature and
# picked k arbitrarily; the value actually used is SVDS_k from the constants cell (currently 5, not 20).

In [78]:
# Truncated SVD via scipy: keep only the SVDS_k largest singular triplets
# of the ratings matrix.
U, sigma, Vt = svds(R_Matrix, k=SVDS_k)

In [79]:
# Convert sigma into a diag matrix to be used for predictions
sigma = np.diag(sigma)

In [80]:
# Recreate the ratings by multiplying the 3 matrices...
# allPredictedRatings = np.dot(np.dot(U, sigma), Vt) + meanUserRatings.reshape(-1, 1)
allPredictedRatings = np.dot(np.dot(U, sigma), Vt)

In [81]:
# Format into a dataframe and add column names
allPredictedRatingsDataFrame = pd.DataFrame(allPredictedRatings, columns = R_DataFrame.columns)

In [82]:
allPredictedRatingsDataFrame.head()

bookIDX,0,1,2,3,4,5,6,7,8,9,...,21118,21119,21120,21121,21122,21123,21124,21125,21126,21127
0,4.306365e-12,2.832825e-09,-7.648235e-08,-9.876823e-08,4.830706e-07,9.174246e-25,6.810681e-07,5.760333e-10,-3.551928e-12,-1.005191e-07,...,-2.200804e-24,6.776032e-26,4.148533e-12,3.7520990000000003e-25,1.133903e-09,3.049285e-09,-6.172012e-24,-8.500658e-12,3.632931e-09,-1.832396e-07
1,2.832825e-09,1.349648e-05,0.0001535091,0.0001725729,0.004028045,-3.470988e-21,0.0007621661,4.243305e-07,4.588225e-09,0.0003075493,...,3.942444e-21,3.421115e-21,3.147944e-08,-3.20463e-21,1.787481e-05,1.847411e-05,5.071039e-21,8.368714e-08,9.331402e-05,-0.0005540848
2,2.36462e-11,6.322654e-07,3.433917e-05,4.033372e-05,4.239208e-06,-9.146716e-22,9.331066e-05,2.606e-08,1.233809e-09,5.640192e-05,...,1.687256e-21,4.304113e-22,4.814888e-10,-7.162019e-22,6.661953e-07,4.007095e-07,3.213014e-21,1.791939e-08,8.667221e-06,-2.739575e-05
3,-2.274362e-10,4.655321e-07,3.188096e-05,3.786711e-05,2.894191e-05,-7.817741000000001e-22,3.222135e-05,-1.388779e-08,1.182792e-09,5.142231e-05,...,1.440632e-21,3.607813e-22,5.04003e-10,-5.959344e-22,6.507665e-07,3.546505e-07,2.8654309999999998e-21,1.454149e-08,7.311791e-06,-2.197356e-05
4,-2.436365e-10,3.752889e-07,3.157694e-05,3.737878e-05,-1.308001e-05,-8.712824e-22,2.891979e-05,-1.711829e-08,1.171272e-09,5.015856e-05,...,1.574885e-21,4.1483980000000002e-22,1.336553e-10,-6.588074e-22,5.065362e-07,1.930881e-07,3.15552e-21,1.423407e-08,6.736462e-06,-1.39736e-05


In [66]:
R_DataFrame.head()

bookIDX,0,1,2,3,4,5,6,7,8,9,...,21118,21119,21120,21121,21122,21123,21124,21125,21126,21127
userIDX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
def recommendBooks(predictionsDF, userIDX, booksDF, original_ratings_df, num_recommendations=5):
    """Recommend the best-predicted books a user has not interacted with yet.

    Parameters
    ----------
    predictionsDF : DataFrame
        Predicted ratings, one row per user and one column per bookIDX.
        The user's row is selected by POSITION (``iloc``).
    userIDX : int
        Position of the user's row in ``predictionsDF``; also the value
        matched against ``original_ratings_df.userIDX``.
    booksDF : DataFrame
        Book details; must contain a ``bookIDX`` column.
    original_ratings_df : DataFrame
        Observed events with ``userIDX``, ``bookIDX`` and
        ``impressionRating`` columns.
    num_recommendations : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    (user_full, recommendations)
        ``user_full``: the user's existing events joined with the book
        details, highest rating first.
        ``recommendations``: rows of ``booksDF`` for books the user has no
        event for, ordered by predicted rating (highest first, books without
        a prediction last). The prediction column itself is not included.

    Also prints a two-line summary.
    """
    # Get and sort the user's predictions. The Series is named explicitly
    # (instead of renaming a column by the userIDX label afterwards) so this
    # also works when predictionsDF's row labels differ from row positions.
    sorted_user_predictions = (predictionsDF.iloc[userIDX]
                               .sort_values(ascending=False)
                               .rename('Predictions'))

    # Get the user's data and merge in the book information.
    user_data = original_ratings_df[original_ratings_df.userIDX == userIDX]
    user_full = (user_data.merge(booksDF, how='left', on='bookIDX')
                 .sort_values(['impressionRating'], ascending=False))

    print('User {0} has already rated {1} books.'.format(userIDX, user_full.shape[0]))
    print('Recommending the highest {0} predicted ratings of books not already rated.'.format(num_recommendations))

    # Prediction table keyed by bookIDX; rename_axis guarantees the key
    # column name even if predictionsDF's columns axis is unnamed.
    prediction_table = sorted_user_predictions.rename_axis('bookIDX').reset_index()

    # Recommend the highest predicted ratings of books the user hasn't interacted with yet
    unseen_books = booksDF[~booksDF['bookIDX'].isin(user_full['bookIDX'])]
    recommendations = (unseen_books
                       .merge(prediction_table, how='left', on='bookIDX')
                       .sort_values('Predictions', ascending=False)
                       # keep the top rows, drop the trailing 'Predictions' column
                       .iloc[:num_recommendations, :-1])

    return user_full, recommendations
#enddef

In [None]:
# Recommendations for the user at row 10 of the SVD prediction matrix.
# (The constant is spelled MAX_RECOMMENDATIONS in the constants cell; the
# previous misspelling raised a NameError.)
already_rated, predictions = recommendBooks(allPredictedRatingsDataFrame, 
                                            10, 
                                            booksDataFrame, 
                                            filledEventsDataFrame, 
                                            MAX_RECOMMENDATIONS)

In [None]:
already_rated

In [None]:
predictions

#### -----------------Verification of SVD using the Surprise library-----------------

In [83]:
# Prepare the data
data = filledEventsDataFrame.copy()

In [84]:
data.drop(columns=["userId", "bookISBN", "author", "yearOfPublication", "authorIDX"], inplace=True)

In [85]:
# Reorder the columns
data = data[["userIDX", "bookIDX", "impressionRating"]]

In [86]:
data.head()

Unnamed: 0,userIDX,bookIDX,impressionRating
0,0,0,2
1,1,1,2
2,2,2,4
3,3,2,3
4,4,2,4


In [87]:
data.shape

(41048, 3)

In [88]:
from surprise import SVD
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise import NormalPredictor
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.model_selection import GridSearchCV

In [89]:
reader = Reader(rating_scale=(0, 5))

# The columns must correspond to user id, item id and ratings (in that order).
surpData = Dataset.load_from_df(data[['userIDX', 'bookIDX', 'impressionRating']], reader)

In [90]:
# Surprise's train_test_split (imported in the cell above). A fixed
# random_state keeps the partition, and the RMSE reported below, reproducible.
trainset, testset = train_test_split(surpData, test_size=.25, random_state=42)

In [91]:
# Use the  SVD algorithm
algo = SVD()

In [92]:
# Train the algorithm on the trainset, and predict ratings for the testset
param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=5)

gs.fit(surpData)

# best RMSE score
print(gs.best_score['rmse'])

1.3073115390341032


In [None]:
# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

In [None]:
# The SVD instance created above has not been trained yet (the grid search
# fits its own internal estimators), so fit it on the training split before
# scoring the test split.
algo.fit(trainset)
predictions = algo.test(testset)

In [None]:
# Then compute RMSE
accuracy.rmse(predictions)

In [None]:
# Raw ids must have the same type as in the DataFrame the dataset was loaded
# from (integers here). String ids would be treated as an unknown user/item
# and only the default prediction would be returned.
pred = algo.predict(2, 2, r_ui = 4, verbose=True)

In [None]:
# Integer raw ids, matching the userIDX / bookIDX dtype of the DataFrame the
# dataset was loaded from (string ids would not match any known user/item).
pred = algo.predict(2, 7005, r_ui = 5, verbose=True)

#### ---------MODEL 1: Simple Matrix Multiplication Implementation--------------

In [None]:
# Model inputs: one integer index per sample for the user and for the book.
usersInput = keras.layers.Input(shape=[1],name='Users')
booksInput = keras.layers.Input(shape=[1], name='Books')

# Embedding tables for users and books. When NNMF is enabled each table gets
# its own non-negativity constraint; otherwise no constraint is passed
# (None, the layer's default).
usersEmbedding = keras.layers.Embedding(input_dim=numOfUsers + 1,
                                        output_dim=MM_LATENT_FACTORS,
                                        name='UsersEmbedding',
                                        embeddings_constraint=non_neg() if NNMF else None)(usersInput)
booksEmbedding = keras.layers.Embedding(input_dim=numOfBooks + 1,
                                        output_dim=MM_LATENT_FACTORS,
                                        name='BooksEmbedding',
                                        embeddings_constraint=non_neg() if NNMF else None)(booksInput)

# Collapse the (1, MM_LATENT_FACTORS) embedding output into a flat vector.
usersVec = keras.layers.Reshape([MM_LATENT_FACTORS], name='UsersVec')(usersEmbedding)
booksVec = keras.layers.Reshape([MM_LATENT_FACTORS], name='BooksVec')(booksEmbedding)

# Predicted rating = plain (un-normalised) dot product of the two vectors.
dotProd = keras.layers.dot([usersVec, booksVec], axes=1, normalize=False, name='DotProduct')

matMulModel = keras.Model(inputs=[usersInput, booksInput], outputs=dotProd)

# Optimizer is chosen by the SGD_OPTIMIZER flag; hyper-parameters come from
# the constants cell and are passed positionally.
optimizer = (SGD(SGD_lr, SGD_momentum, SGD_decay, SGD_nesterov)
             if SGD_OPTIMIZER
             else Adam(Adam_lr, Adam_beta_1, Adam_beta_2, Adam_epsilon, Adam_decay, Adam_amsgrad))

matMulModel.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mse', 'acc'])

matMulModel.summary()

In [None]:
plot_model(matMulModel, to_file='matMulModel.png', show_shapes=True)
SVG(model_to_dot(matMulModel).create(prog='dot', format='svg'))

In [None]:
callbacks = [keras.callbacks.EarlyStopping('val_loss', patience=3), 
             keras.callbacks.ModelCheckpoint('matMulModelWeights.h5', save_best_only=True)]

trainingHistory = matMulModel.fit([trainDataFrame.userIDX, 
                                   trainDataFrame.bookIDX], 
                                  trainDataFrame.impressionRating,
                                  validation_split=.1,  
                                  callbacks=callbacks,
                                  batch_size=MM_BATCH_SIZE,
                                  epochs=MM_TRAIN_EPOCHS, 
                                  verbose=TRAIN_VERBOSE)

In [None]:
pd.Series(trainingHistory.history)

In [None]:
pd.Series(trainingHistory.history['loss']).plot(logy=True, label="Train Loss")
pd.Series(trainingHistory.history['val_loss']).plot(logy=True, label="Val Loss")
plt.legend(loc='upper right')
plt.xlabel("Epoch")
plt.ylabel("Training Error")

In [None]:
# Make some preditions
matMulRatingsPredictions = matMulModel.predict([testDataFrame.userIDX, testDataFrame.bookIDX])
matMulRatingsPredictions = np.rint(matMulRatingsPredictions)

In [None]:
displaySideBySide(pd.DataFrame(matMulRatingsPredictions).head(100), pd.DataFrame(actualImpressionRatings.head(100)))

In [None]:
# How did we do?
print("MAE {0:2.2f}".format(mean_absolute_error(actualImpressionRatings, matMulRatingsPredictions)))
print("RMSE {0:2.2f}".format(sqrt(mean_squared_error(actualImpressionRatings, matMulRatingsPredictions))))

In [None]:
# Retrieve the learnt embeddings
booksLearntEmbeddings = matMulModel.get_layer(name='BooksEmbedding').get_weights()[0]
usersLearntEmbeddings = matMulModel.get_layer(name='UsersEmbedding').get_weights()[0]

In [None]:
displaySideBySide(pd.DataFrame(usersLearntEmbeddings).head(), pd.DataFrame(usersLearntEmbeddings).describe())

In [None]:
displaySideBySide(pd.DataFrame(booksLearntEmbeddings).head(), pd.DataFrame(booksLearntEmbeddings).describe())

#### ----------Model 2: Neural Net Implementation (adding depth)-------------

In [None]:
# Model inputs: one integer index per sample for the user and for the book.
usersInput = keras.layers.Input(shape=[1],name='Users')
booksInput = keras.layers.Input(shape=[1], name='Books')

# Width of the per-user / per-book bias embeddings (a single scalar each).
bias = 1

# Latent-factor embeddings. When NNMF is enabled each table gets its own
# non-negativity constraint; otherwise no constraint is applied.
usersEmbedding = keras.layers.Embedding(input_dim=numOfUsers + 1,
                                        output_dim=MM_LATENT_FACTORS,
                                        name='UsersEmbedding',
                                        embeddings_constraint=non_neg() if NNMF else None)(usersInput)
booksEmbedding = keras.layers.Embedding(input_dim=numOfBooks + 1,
                                        output_dim=MM_LATENT_FACTORS,
                                        name='BooksEmbedding',
                                        embeddings_constraint=non_neg() if NNMF else None)(booksInput)

# Learned scalar bias per user and per book.
userBias = keras.layers.Embedding(input_dim=numOfUsers + 1,
                                  output_dim=bias,
                                  input_length=1,
                                  name="userBias")(usersInput)
bookBias = keras.layers.Embedding(input_dim=numOfBooks + 1,
                                  output_dim=bias,
                                  input_length=1,
                                  name="bookBias")(booksInput)

userBias = keras.layers.Flatten()(userBias)
bookBias = keras.layers.Flatten()(bookBias)

usersVec = keras.layers.Flatten()(usersEmbedding)
booksVec = keras.layers.Flatten()(booksEmbedding)

# normalize=True L2-normalises both vectors before the dot product, so this
# feature is the cosine similarity of the user and book embeddings.
dotProd = keras.layers.dot([usersVec, booksVec], axes=1, normalize=True, name='DotProduct')

concatVecs = keras.layers.Concatenate()([dotProd, userBias, bookBias])

d1 = keras.layers.Dense(128, activation='relu')(concatVecs)
d1_drop = keras.layers.Dropout(0.2, name='Dropout')(d1)

# The output layer is fed from the dropout output. Previously it was built on
# `d1` directly, which left the dropout layer outside the model graph. The
# second Dropout that used to follow the output was removed: it was never part
# of the model, reused the layer name 'Dropout', and dropout on the final
# regression output is not meaningful.
d2 = keras.layers.Dense(1)(d1_drop)

neuralNetModel = keras.Model([usersInput, booksInput], d2)

# Optimizer is chosen by the SGD_OPTIMIZER flag; hyper-parameters come from
# the constants cell and are passed positionally.
if SGD_OPTIMIZER:
    optimizer = SGD(SGD_lr, SGD_momentum, SGD_decay, SGD_nesterov)
else:
    optimizer = Adam(Adam_lr, Adam_beta_1, Adam_beta_2, Adam_epsilon, Adam_decay, Adam_amsgrad)

neuralNetModel.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mse', 'acc'])

neuralNetModel.summary()

In [None]:
plot_model(neuralNetModel, to_file='neuralNetModel.png', show_shapes=True)
SVG(model_to_dot(neuralNetModel).create(prog='dot', format='svg'))

In [None]:
callbacks = [keras.callbacks.EarlyStopping('val_loss', patience=3), 
             keras.callbacks.ModelCheckpoint('neuralNetModelWeights.h5', save_best_only=True)]

trainingHistory = neuralNetModel.fit([trainDataFrame.userIDX, trainDataFrame.bookIDX], 
                                     trainDataFrame.impressionRating, 
                                     validation_split=.1, 
                                     callbacks=callbacks,
                                     batch_size=NN_BATCH_SIZE,
                                     epochs=NN_TRAIN_EPOCHS, 
                                     verbose=TRAIN_VERBOSE)

In [None]:
pd.Series(trainingHistory.history)

In [None]:
pd.Series(trainingHistory.history['loss']).plot(logy=True, label="Train Loss")
pd.Series(trainingHistory.history['val_loss']).plot(logy=True, label="Val Loss")
plt.legend(loc='upper right')
plt.xlabel("Epoch")
plt.ylabel("Training Error")

In [None]:
# Make some preditions
neuralNetRatingsPredictions = neuralNetModel.predict([testDataFrame.userIDX, 
                                                      testDataFrame.bookIDX])
neuralNetRatingsPredictions = np.rint(neuralNetRatingsPredictions)

In [None]:
displaySideBySide(pd.DataFrame(neuralNetRatingsPredictions).head(30), 
                  pd.DataFrame(actualImpressionRatings).head(30))

#### ----------Model 3: Neural Net Implementation (adding Bias and 2 features: Author & YearOfPublication)-------------

In [None]:
# Model inputs: integer user and book indices plus two extra per-event
# features (author index and year of publication).
usersInput = keras.layers.Input(shape=[1],name='Users')
booksInput = keras.layers.Input(shape=[1], name='Books')
authorsInput = keras.layers.Input(shape=[1], name='Authors')
yearOfPubInput = keras.layers.Input(shape=[1], name='YearOfPub')

# Width of the per-user / per-book bias embeddings (a single scalar each).
bias = 1

# Latent-factor embeddings. When NNMF is enabled each table gets its own
# non-negativity constraint; otherwise no constraint is applied.
usersEmbedding = keras.layers.Embedding(input_dim=numOfUsers + 1,
                                        output_dim=MM_LATENT_FACTORS,
                                        name='UsersEmbedding',
                                        embeddings_constraint=non_neg() if NNMF else None)(usersInput)
booksEmbedding = keras.layers.Embedding(input_dim=numOfBooks + 1,
                                        output_dim=MM_LATENT_FACTORS,
                                        name='BooksEmbedding',
                                        embeddings_constraint=non_neg() if NNMF else None)(booksInput)

# Learned scalar bias per user and per book.
userBias = keras.layers.Embedding(input_dim=numOfUsers + 1,
                                  output_dim=bias,
                                  input_length=1,
                                  name="userBias")(usersInput)
bookBias = keras.layers.Embedding(input_dim=numOfBooks + 1,
                                  output_dim=bias,
                                  input_length=1,
                                  name="bookBias")(booksInput)

userBias = keras.layers.Flatten()(userBias)
bookBias = keras.layers.Flatten()(bookBias)

usersVec = keras.layers.Flatten()(usersEmbedding)
booksVec = keras.layers.Flatten()(booksEmbedding)

# normalize=True L2-normalises both vectors before the dot product, so this
# feature is the cosine similarity of the user and book embeddings.
dotProd = keras.layers.dot([usersVec, booksVec], axes=1, normalize=True, name='DotProduct')

# NOTE(review): the author index and the publication year enter the network
# as raw, unscaled numbers next to features of order 1 - consider an
# embedding for the author and scaling for the year.
concatVecs = keras.layers.Concatenate()([dotProd, userBias, bookBias, authorsInput, yearOfPubInput])

d1 = keras.layers.Dense(128, activation='relu')(concatVecs)
d1_drop = keras.layers.Dropout(0.2, name='Dropout')(d1)

# The output layer is fed from the dropout output. Previously it was built on
# `d1` directly, which left the dropout layer outside the model graph. The
# second Dropout that used to follow the output was removed: it was never part
# of the model, reused the layer name 'Dropout', and dropout on the final
# regression output is not meaningful.
d2 = keras.layers.Dense(1)(d1_drop)

neuralNetModel = keras.Model(inputs=[usersInput, booksInput, authorsInput, yearOfPubInput], outputs=d2)

# Optimizer is chosen by the SGD_OPTIMIZER flag; hyper-parameters come from
# the constants cell and are passed positionally.
if SGD_OPTIMIZER:
    optimizer = SGD(SGD_lr, SGD_momentum, SGD_decay, SGD_nesterov)
else:
    optimizer = Adam(Adam_lr, Adam_beta_1, Adam_beta_2, Adam_epsilon, Adam_decay, Adam_amsgrad)

neuralNetModel.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mse', 'acc'])

neuralNetModel.summary()

In [None]:
plot_model(neuralNetModel, to_file='neuralNetModel.png', show_shapes=True)
SVG(model_to_dot(neuralNetModel).create(prog='dot', format='svg'))

In [None]:
# Stop when validation loss stops improving (patience of 3 epochs) and keep
# only the best weights on disk.
earlyStopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
bestCheckpoint = keras.callbacks.ModelCheckpoint(filepath='neuralNetModelWeights.h5',
                                                 save_best_only=True)
callbacks = [earlyStopping, bestCheckpoint]

# Inputs must be listed in the same order as the model's declared inputs.
trainInputs = [trainDataFrame.userIDX,
               trainDataFrame.bookIDX,
               trainDataFrame.authorIDX,
               trainDataFrame.yearOfPublication]

trainingHistory = neuralNetModel.fit(x=trainInputs,
                                     y=trainDataFrame.impressionRating,
                                     batch_size=NN_BATCH_SIZE,
                                     epochs=NN_TRAIN_EPOCHS,
                                     verbose=TRAIN_VERBOSE,
                                     callbacks=callbacks,
                                     validation_split=.1)

In [None]:
# Show everything recorded during training, one entry per tracked metric.
historyByMetric = pd.Series(trainingHistory.history)
historyByMetric

In [None]:
# Training vs. validation loss per epoch, on a log-scaled y axis.
lossCurves = [('loss', "Train Loss"), ('val_loss', "Val Loss")]
for historyKey, curveLabel in lossCurves:
    pd.Series(trainingHistory.history[historyKey]).plot(logy=True, label=curveLabel)

plt.legend(loc='upper right')
plt.xlabel("Epoch")
plt.ylabel("Training Error")

In [None]:
# Make some predictions.
# The model above is declared and fitted with four inputs (user, book, author,
# year of publication), so predict() needs all four, in the same order.
# Assumes testDataFrame carries the same columns as trainDataFrame - confirm.
neuralNetRatingsPredictions = neuralNetModel.predict([testDataFrame.userIDX,
                                                      testDataFrame.bookIDX,
                                                      testDataFrame.authorIDX,
                                                      testDataFrame.yearOfPublication])
# Round to the nearest integer so predictions are comparable with the ratings.
neuralNetRatingsPredictions = np.rint(neuralNetRatingsPredictions)

In [None]:
# Compare the first 30 predicted ratings with the first 30 actual ratings.
predictedPreview = pd.DataFrame(neuralNetRatingsPredictions).head(30)
actualPreview = pd.DataFrame(actualImpressionRatings).head(30)
displaySideBySide(predictedPreview, actualPreview)

#### ----------Model 4: Neural Net Implementation (adding more depth)-------------

In [None]:
# Neural Net Implementation (deeper variant): book and user embeddings are
# concatenated and passed through a stack of fully connected layers.

booksInput = keras.layers.Input(shape=[1], name='Books')
booksEmbedding = keras.layers.Embedding(numOfBooks + 1,
                                        NN_MOVIE_LATENT_FACTORS,
                                        embeddings_regularizer=l2(1e-4),
                                        name='BooksEmbedding')(booksInput)
booksVec = keras.layers.Flatten(name='FlattenBooks')(booksEmbedding)
booksVec = keras.layers.Dropout(0.2)(booksVec)


usersInput = keras.layers.Input(shape=[1], name='Users')
usersEmbedding = keras.layers.Embedding(numOfUsers + 1,
                                        NN_USER_LATENT_FACTORS,
                                        embeddings_regularizer=l2(1e-4),
                                        name='UsesEmbedding')(usersInput)
usersVec = keras.layers.Flatten(name='FlattenUsers')(usersEmbedding)
usersVec = keras.layers.Dropout(0.2)(usersVec)

# Both halves already went through Dropout(0.2) above, so no extra dropout is
# applied to the concatenated vector (the old one was never connected anyway).
concatLayer = keras.layers.concatenate([booksVec, usersVec], axis=1, name='ConcatLayer')

# Each dense layer consumes the dropout output of the previous one. Previously
# the layers were wired to the wrong tensors, which left the 200-unit layer and
# every named Dropout outside the model graph. Dropout layers get unique names
# because Keras requires layer names to be unique within a model.
# NOTE(review): the first three Dense layers have no activation (linear);
# consider activation='relu' if non-linear depth is intended.
denseLayer_1 = keras.layers.Dense(200, name='FullyConnected-0')(concatLayer)
dropoutLayer_1 = keras.layers.Dropout(0.2, name='Dropout-0')(denseLayer_1)

denseLayer_2 = keras.layers.Dense(100, name='FullyConnected-1')(dropoutLayer_1)
dropoutLayer_2 = keras.layers.Dropout(0.2, name='Dropout-1')(denseLayer_2)

denseLayer_3 = keras.layers.Dense(50, name='FullyConnected-2')(dropoutLayer_2)
dropoutLayer_3 = keras.layers.Dropout(0.2, name='Dropout-2')(denseLayer_3)

denseLayer_4 = keras.layers.Dense(20, name='FullyConnected-3', activation='relu')(dropoutLayer_3)
# Single-unit output with ReLU, so predictions are never negative.
result = keras.layers.Dense(1, activation='relu', name='Activation')(denseLayer_4)

# Optimizer hyper-parameters come from the constants cell; keywords make the
# mapping explicit instead of relying on positional order.
if SGD_OPTIMIZER:
    optimizer = SGD(lr=SGD_lr, momentum=SGD_momentum, decay=SGD_decay, nesterov=SGD_nesterov)
else:
    optimizer = Adam(lr=Adam_lr, beta_1=Adam_beta_1, beta_2=Adam_beta_2,
                     epsilon=Adam_epsilon, decay=Adam_decay, amsgrad=Adam_amsgrad)

neuralNetModel = keras.Model(inputs=[usersInput, booksInput], outputs=result)
# NOTE(review): 'acc' is not a meaningful metric for a regression target;
# it is kept so the training history keys stay unchanged.
neuralNetModel.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mse', 'acc'])

neuralNetModel.summary()

In [None]:
# Save the architecture diagram to disk, then render it inline as SVG.
plot_model(neuralNetModel, to_file='neuralNetModel.png', show_shapes=True)
modelGraph = model_to_dot(neuralNetModel)
SVG(modelGraph.create(prog='dot', format='svg'))

In [None]:
# Stop when validation loss stops improving (patience of 3 epochs) and keep
# only the best weights on disk.
earlyStopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
bestCheckpoint = keras.callbacks.ModelCheckpoint(filepath='neuralNetModelWeights.h5',
                                                 save_best_only=True)
callbacks = [earlyStopping, bestCheckpoint]

# Inputs must be listed in the same order as the model's declared inputs.
trainInputs = [trainDataFrame.userIDX, trainDataFrame.bookIDX]

trainingHistory = neuralNetModel.fit(x=trainInputs,
                                     y=trainDataFrame.impressionRating,
                                     batch_size=NN_BATCH_SIZE,
                                     epochs=NN_TRAIN_EPOCHS,
                                     verbose=TRAIN_VERBOSE,
                                     callbacks=callbacks,
                                     validation_split=.1)

In [None]:
# Show everything recorded during training, one entry per tracked metric.
historyByMetric = pd.Series(trainingHistory.history)
historyByMetric

In [None]:
# Training vs. validation loss per epoch, on a log-scaled y axis.
lossCurves = [('loss', "Train Loss"), ('val_loss', "Val Loss")]
for historyKey, curveLabel in lossCurves:
    pd.Series(trainingHistory.history[historyKey]).plot(logy=True, label=curveLabel)

plt.legend(loc='upper right')
plt.xlabel("Epoch")
plt.ylabel("Training Error")

In [None]:
# Make some predictions on the held-out set, rounded to the nearest integer.
testInputs = [testDataFrame.userIDX, testDataFrame.bookIDX]
rawPredictions = neuralNetModel.predict(testInputs)
neuralNetRatingsPredictions = np.rint(rawPredictions)

In [None]:
# Compare the first 30 predicted ratings with the first 30 actual ratings.
predictedPreview = pd.DataFrame(neuralNetRatingsPredictions).head(30)
actualPreview = pd.DataFrame(actualImpressionRatings).head(30)
displaySideBySide(predictedPreview, actualPreview)

In [None]:
# How did we do? Report mean absolute error and root mean squared error
# of the rounded predictions against the actual ratings.
maeScore = mean_absolute_error(actualImpressionRatings, neuralNetRatingsPredictions)
print("MAE {0:2.2f}".format(maeScore))
rmseScore = sqrt(mean_squared_error(actualImpressionRatings, neuralNetRatingsPredictions))
print("RMSE {0:2.2f}".format(rmseScore))

#### ---------END OF NOTEBOOK--------------