# Install & Import modules

In [0]:
# Install Keras into the runtime via a shell escape.
# NOTE(review): no version is pinned, so a rerun may install a different Keras
# release than the one this notebook was written against — consider pinning.
!pip install keras



In [0]:
import os, codecs, gc
import pandas as pd
import codecs
import numpy as np
import matplotlib.pyplot as plt
import warnings
import keras
from keras import regularizers
from keras.layers import Input, Embedding, Flatten, merge, Dense, Dropout, Lambda, dot
from keras.models import Model
from keras.utils.vis_utils import model_to_dot
from keras.constraints import non_neg
from keras.callbacks import ModelCheckpoint, EarlyStopping, TerminateOnNaN
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

In [0]:
# mount google my drive
from google.colab import drive

# Keep the mount point in one place so the data directory is derived from it.
mount_point = '/content/drive'
drive.mount(mount_point)

# Absolute path: the CSV loaders below keep working even if the current
# working directory is not the directory that contains the mount.
input_dir = mount_point + "/My Drive/input/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load data files
 Book-Crossing Dataset  
 http://www2.informatik.uni-freiburg.de/~cziegler/BX/

In [0]:
# Read the users CSV (";"-separated) into a DataFrame.
# Bytes that are not valid UTF-8 are dropped by the "ignore" error handler.
users_csv = input_dir + "BX-Users.csv"
with codecs.open(users_csv, mode="r", encoding="utf8", errors="ignore") as users_file:
    user = pd.read_csv(users_file, sep=";")

In [0]:
# Read the books CSV (";"-separated) into a DataFrame with explicit column names.
col_name = [
    "ISBN", "Title", "Author", "Year",
    "Publisher", "URL-S", "URL-M", "URL-L",
]
items_csv = input_dir + "BX-Books.csv"
with codecs.open(items_csv, mode="r", encoding="utf8", errors="ignore") as items_file:
    item = pd.read_csv(
        items_file,
        sep=";",
        names=col_name,
        skiprows=1,                # skip the first line; `names` supplies the header
        converters={"Year": str},  # keep Year as text so it can be pattern-matched later
    )

In [0]:
# Read the ratings CSV (";"-separated) into a DataFrame.
# Bytes that are not valid UTF-8 are dropped by the "ignore" error handler.
ratings_csv = input_dir + "BX-Book-Ratings.csv"
with codecs.open(ratings_csv, mode="r", encoding="utf8", errors="ignore") as ratings_file:
    rating = pd.read_csv(ratings_file, sep=";")

# data cleaning

In [0]:
# Left-join the book metadata onto every rating row, matching on ISBN.
# Ratings whose ISBN is absent from `item` keep NaN in the book columns.
rating_author = rating.merge(item, on='ISBN', how='left')

In [0]:
# Preview the first five joined rows.
rating_author.head(n=5)

Unnamed: 0,User-ID,ISBN,Book-Rating,Title,Author,Year,Publisher,URL-S,URL-M,URL-L
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
1,276726,0155061224,5,Rites of Passage,Judith Rae,2001,Heinle,http://images.amazon.com/images/P/0155061224.0...,http://images.amazon.com/images/P/0155061224.0...,http://images.amazon.com/images/P/0155061224.0...
2,276727,0446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...
3,276729,052165615X,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press,http://images.amazon.com/images/P/052165615X.0...,http://images.amazon.com/images/P/052165615X.0...,http://images.amazon.com/images/P/052165615X.0...
4,276729,0521795028,6,The Amsterdam Connection : Level 4 (Cambridge ...,Sue Leather,2001,Cambridge University Press,http://images.amazon.com/images/P/0521795028.0...,http://images.amazon.com/images/P/0521795028.0...,http://images.amazon.com/images/P/0521795028.0...


In [0]:
# select User-ID, Author, Book-Rating, Year (by label rather than by position,
# so the selection does not silently shift if the column order changes)
rating_author = rating_author.loc[:, ['User-ID', 'Author', 'Book-Rating', 'Year']]

# drop rows with any missing value; reassign instead of mutating in place,
# because the frame above was derived from another frame by selection
rating_author = rating_author.dropna()

# keep only rows whose Year contains at least one digit, then drop the Year column
# (raw string: '\d' in a normal string literal is an invalid escape sequence)
has_digit_year = rating_author['Year'].str.contains(pat=r'\d', regex=True)
rating_author = rating_author.loc[has_digit_year, ['User-ID', 'Author', 'Book-Rating']]

# arrange dataset

In [0]:
# Average the ratings each user gave to each author, ordered by user then author.
group_keys = ['User-ID', 'Author']
data = (
    rating_author
    .groupby(group_keys)["Book-Rating"]
    .agg(['mean'])
    .reset_index()
    .sort_values(by=group_keys)
)
data.columns = ["userID", "author", "raw_ratings"]

In [0]:
# Cast the averaged ratings to integers (the fractional part is discarded).
data["raw_ratings"] = data["raw_ratings"].astype("int")

In [0]:
# Release the intermediate frames that are no longer needed, then force a GC pass.
del user
del item
del rating
del rating_author
gc.collect()

91

# Build the dataset for Keras

In [0]:
# Encode user IDs and author names as integer category codes.
data["user_category"] = data["userID"].astype('category').cat.codes.values
data["author_category"] = data["author"].astype('category').cat.codes.values

In [0]:
# Preview the first five rows of the encoded dataset.
data.head(n=5)

Unnamed: 0,userID,author,raw_ratings,user_category,author_category
0,2,Mark P. O. Morford,0,0,64914
1,8,Adam Lebor,0,1,846
2,8,Amy Tan,0,1,3363
3,8,Ann Beattie,5,1,4388
4,8,Carlo D'Este,0,1,12614


In [0]:
# Bin raw_ratings into four classes:
#   0 -> 0,  1..4 -> 1,  5..7 -> 2,  anything else -> 3
rating_values = data.raw_ratings
bin_conditions = [
    rating_values == 0,
    rating_values.isin([1, 2, 3, 4]),
    rating_values.isin([5, 6, 7]),
]
data.raw_ratings = np.select(bin_conditions, [0, 1, 2], default=3)

In [0]:
# Features: every column except the identifiers and the target.
X = data.drop(columns=['userID', 'author', 'raw_ratings'])
# Target: the binned rating.
y = data["raw_ratings"]

# Define training function

In [0]:
def train_keras(model, k=5, epochs=10, test_size=0.1, reset_weights=False):
  """Fit and evaluate `model` on `k` random hold-out splits of the global X / y.

  Each round draws a new train/test split (seeded with the round index), fits
  the model on the training part, evaluates it on the held-out part and prints
  the hold-out RMSE.

  Relies on names defined elsewhere in the notebook: `X`, `y`, and the
  callbacks `mcheck`, `echeck`, `ncheck`.

  NOTE(review): with reset_weights=False (the default, which keeps the
  original behaviour) the same model object keeps training from round to
  round, while each round uses a different split. Rows held out in a later
  round may therefore have been trained on in an earlier round, so later
  scores are not clean hold-out estimates.

  Args:
    model: compiled Keras model taking [user_category, author_category].
    k: number of split / fit / evaluate rounds.
    epochs: maximum number of epochs per round.
    test_size: fraction of the data held out in each round.
    reset_weights: if True, restore the weights the model had on entry before
      every round. Only the layer weights are restored — optimizer state is
      presumably carried over between rounds; TODO confirm whether that matters.

  Returns:
    List with the hold-out RMSE of every round, in round order.
  """
  # Snapshot the starting weights only when the caller asked for a reset.
  initial_weights = model.get_weights() if reset_weights else None
  scores = []

  for i in range(k):
    print("===========Round" + str(i) + " Start===========" )
    if initial_weights is not None:
      model.set_weights(initial_weights)

    train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=test_size, random_state=i)

    model.fit([train_x.user_category, train_x.author_category],  train_y,  epochs=epochs, validation_split=0.2,
              callbacks=[mcheck, echeck, ncheck], verbose=1)

    model.evaluate([test_x.user_category, test_x.author_category], test_y, verbose=1)
    pred = model.predict([test_x.user_category, test_x.author_category])

    # Keep the score as well as printing it, so callers can compare rounds.
    score = np.sqrt(mean_squared_error(test_y, pred))
    print(score)
    scores.append(score)

  return scores

# Build the Keras MF network & train it

In [0]:
# Number of distinct user / author codes, used to size the embedding tables.
n_users = data.user_category.nunique()
n_author = data.author_category.nunique()
# Width of each embedding vector.
n_latent_factors = 3

In [0]:
# define metrics
from keras import backend as K
def rmse(y_true, y_pred):
    """Root-mean-squared error between targets and predictions (Keras metric).

    The squared error is averaged over every element of the batch before the
    square root is taken. Reducing over the last axis only would, for a model
    with a single output unit, take the square root of each sample's own
    squared error, i.e. report the mean absolute error instead of the RMSE.

    Args:
        y_true: tensor of target values.
        y_pred: tensor of predicted values.

    Returns:
        Scalar tensor holding the RMSE of the batch.
    """
    return K.sqrt(K.mean(K.square(y_pred - y_true)))

In [0]:
# author network: author code -> embedding vector -> flatten -> dropout
author_input = keras.layers.Input(shape=[1], name='author')
author_embedding = keras.layers.Embedding(n_author + 1, n_latent_factors, name='author-Embedding')(author_input)
author_vec = keras.layers.Flatten(name='flatten_author')(author_embedding)
author_vec = keras.layers.Dropout(0.2)(author_vec)

# user network: user code -> embedding vector -> flatten -> dropout
user_input = keras.layers.Input(shape=[1],name='User')
# The user table must be sized by the number of users (not authors); otherwise
# user codes can fall outside the embedding table when there are more users
# than authors.
user_embedding = keras.layers.Embedding(n_users + 1, n_latent_factors, name='user-Embedding')(user_input)
user_vec = keras.layers.Flatten(name='flatten_users')(user_embedding)
user_vec = keras.layers.Dropout(0.2)(user_vec)

# concat author and user
concat_vec = keras.layers.concatenate([author_vec, user_vec], axis=-1)
concat_vec = keras.layers.Dropout(0.2)(concat_vec)

# fully-connected head producing a single non-negative score
dense4 = keras.layers.Dense(4, name='FullyConnected1', activation='relu')(concat_vec)
result = keras.layers.Dense(1, activation='relu',name='Activation')(dense4)

# Input order is [user, author]; train_keras feeds the columns in that order.
model = keras.Model([user_input, author_input], result)
model.compile(optimizer='Adagrad', loss='mse', metrics=[rmse])

In [0]:
# Callbacks handed to model.fit by train_keras.

# Write the model to ./recommend.h5, keeping only the best val_loss so far.
mcheck = ModelCheckpoint(
    filepath="./recommend.h5",
    monitor='val_loss',
    save_best_only=True,
)

# Stop training once val_loss stops improving (patience=0).
echeck = EarlyStopping(
    monitor='val_loss',
    patience=0,
    verbose=0,
    mode='auto',
)

# Abort training if the loss becomes NaN.
ncheck = TerminateOnNaN()

In [0]:
# Run the repeated hold-out training loop on the MF model.
train_keras(model=model)

Train on 583859 samples, validate on 145965 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
1.0977994338977937
Train on 583859 samples, validate on 145965 samples
Epoch 1/10
Epoch 2/10
1.0317214188837773
Train on 583859 samples, validate on 145965 samples
Epoch 1/10
Epoch 2/10
1.0109699795273923
Train on 583859 samples, validate on 145965 samples
Epoch 1/10
Epoch 2/10
0.9926018199357021
Train on 583859 samples, validate on 145965 samples
Epoch 1/10
Epoch 2/10
0.9806957998797429


# Build the Keras NMF network (non-negative embeddings) & train it

In [0]:
from keras.constraints import non_neg

# author network: author code -> non-negative embedding -> flatten -> dropout
author_input = keras.layers.Input(shape=[1], name='author')
author_embedding = keras.layers.Embedding(n_author + 1, n_latent_factors, name='author-Embedding', embeddings_constraint=non_neg())(author_input)
author_vec = keras.layers.Flatten(name='flatten_author')(author_embedding)
author_vec = keras.layers.Dropout(0.2)(author_vec)

# user network: user code -> non-negative embedding -> flatten -> dropout
user_input = keras.layers.Input(shape=[1],name='User')
# The user table must be sized by the number of users (not authors); otherwise
# user codes can fall outside the embedding table when there are more users
# than authors.
user_embedding = keras.layers.Embedding(n_users + 1, n_latent_factors, name='user-Embedding', embeddings_constraint=non_neg())(user_input)
user_vec = keras.layers.Flatten(name='flatten_users')(user_embedding)
user_vec = keras.layers.Dropout(0.2)(user_vec)

# concat author and user
concat_vec = keras.layers.concatenate([author_vec, user_vec], axis=-1)
concat_vec = keras.layers.Dropout(0.2)(concat_vec)

# fully-connected head producing a single non-negative score
dense4 = keras.layers.Dense(4, name='FullyConnected1', activation='relu')(concat_vec)
result = keras.layers.Dense(1, activation='relu',name='Activation')(dense4)

# Input order is [user, author]; train_keras feeds the columns in that order.
model = keras.Model([user_input, author_input], result)
model.compile(optimizer='Adagrad', loss='mse', metrics=[rmse])

In [0]:
# Run the repeated hold-out training loop on the NMF model.
train_keras(model=model)

Train on 583859 samples, validate on 145965 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
0.43244895150866175
Train on 583859 samples, validate on 145965 samples
Epoch 1/10
Epoch 2/10
0.4096420640823176
Train on 583859 samples, validate on 145965 samples
Epoch 1/10
Epoch 2/10
0.40233200245252015
Train on 583859 samples, validate on 145965 samples
Epoch 1/10
Epoch 2/10
0.3974245879063985
Train on 583859 samples, validate on 145965 samples
Epoch 1/10
Epoch 2/10
0.3935369646725725
