A Non-linear Colloborative Filtering (MF) implementation in Keras. Problem structure is that of Movie Lens Recommender System. User and Item embeddings are mapped to non-linear random feature maps, and linear functional is used to drive the signal.

1)Plain CF Keras implementation
http://www.fenris.org/2016/03/07/index-html

2) CF and its extension
https://github.com/bradleypallen/keras-movielens-cf

3) Analysis of Random Binning
http://www.kdd.org/kdd2016/papers/files/rfp0942-wuA.pdf

4) Nueral Non-negative Matrix Factorization
https://arxiv.org/pdf/1511.06443.pdf

5) SVIF extension of Nueral Non-negative Matrix Factorization
https://www.cs.toronto.edu/~jstolee/projects/matrix_factorization_neural.pdf

6) Nueral Factorization Machine
https://arxiv.org/pdf/1708.05027.pdf
https://github.com/hexiangnan/neural_factorization_machine

7) Random Kitcen Sinks
https://people.eecs.berkeley.edu/~brecht/papers/08.rah.rec.nips.pdf

8) Extreme Machine Learning with Single Layer FeedForward Network (SLFN)
http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.217.5697&rep=rep1&type=pdf

In [1]:
%matplotlib inline
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from math import pi

from surprise import Dataset


from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, Reshape, Merge, Concatenate
from keras import backend as K
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils


from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn import datasets

from scipy import stats

Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
# get a list of (user_id,item_id,rating, time_stamp)
data = Dataset.load_builtin('ml-100k')
# convert into into a pd data frame
df = pd.DataFrame(data.raw_ratings, columns=['userid', 'movieid','rating','timestamp'])
# create unique id for user and movie
df['userid'] = pd.to_numeric(df['userid'])
df['movieid'] = pd.to_numeric(df['movieid'])
df['user_emb_id'] = df['userid'] - 1
df['movie_emb_id'] = df['movieid'] - 1
df['rating'] = pd.to_numeric(df['rating'])
df.head()

Unnamed: 0,userid,movieid,rating,timestamp,user_emb_id,movie_emb_id
0,196,242,3.0,881250949,195,241
1,186,302,3.0,891717742,185,301
2,22,377,1.0,878887116,21,376
3,244,51,2.0,880606923,243,50
4,166,346,1.0,886397596,165,345


In [3]:
K_FACTORS = 100
RNG_SEED = 1446557
D_RKS = 1000

max_userid = df['userid'].drop_duplicates().max()
max_movieid = df['movieid'].drop_duplicates().max()
print(str(len(df))+ ' ratings loaded.')
print('# of users: ' + str(max_userid))
print('# of items: ' + str(max_movieid))
max_emb_userid = df['user_emb_id'].drop_duplicates().max()
max_emb_movieid = df['movie_emb_id'].drop_duplicates().max()
print('# of users: ' + str(max_emb_userid))
print('# of items: ' + str(max_emb_movieid))
print(df['movieid'].max())

100000 ratings loaded.
# of users: 943
# of items: 1682
# of users: 942
# of items: 1681
1682


In [4]:
shuffled_df = df.sample(frac=1., random_state=RNG_SEED)
Users = shuffled_df['user_emb_id'].values
print('Users shape ='+ str(Users.shape))
Movies = shuffled_df['movie_emb_id'].values
print('Movies shape ='+ str(Movies.shape))
Ratings = shuffled_df['rating'].values
print('Ratings shape ='+str(df.shape))
print(Users[:5])
print(Movies[:5])
print(Ratings[:5])

Users shape =(100000,)
Movies shape =(100000,)
Ratings shape =(100000, 6)
[304 902 928 902  57]
[ 732 1069  134   88  432]
[ 3.  4.  5.  4.  5.]


In [5]:
# baseline
from surprise import SVD
from surprise import evaluate, print_perf
# We'll use the famous SVD algorithm.
data.split(n_folds=2)
algo = SVD()
# Evaluate performances of our algorithm on the dataset.
perf = evaluate(algo, data, measures=['RMSE', 'MAE'])
print_perf(perf)

Evaluating RMSE, MAE of algorithm SVD.

------------
Fold 1
RMSE: 0.9563
MAE:  0.7574
------------
Fold 2
RMSE: 0.9582
MAE:  0.7559
------------
------------
Mean RMSE: 0.9572
Mean MAE : 0.7566
------------
------------
        Fold 1  Fold 2  Mean    
RMSE    0.9563  0.9582  0.9572  
MAE     0.7574  0.7559  0.7566  


In [12]:
# define a Sequential model from scratch
n_users = max_userid
k_factors = 100
m_items = max_movieid
d_rks = 1000

# define a custom activaiton function
def cos_rks(x):
    return K.cos(x)

# define weights
from math import pi
user_b1 = np.random.uniform(low=0.0, high=pi, size=[d_rks,])
user_w1 = np.random.normal(loc=0.0, scale=1.0,size=[k_factors,d_rks])
user_weights = [user_w1,user_b1]

item_b1 = np.random.uniform(low=0.0, high=pi, size=[d_rks,])
item_w1 = np.random.normal(loc=0.0, scale=1.0,size=[k_factors,d_rks])
item_weights = [item_w1,item_b1]

from keras.layers.merge import concatenate, dot
from keras.layers import Embedding, Input, Flatten
from keras.models import Model
from keras.layers.core import Dense

# user features
user_input = Input(shape=(1,), dtype='int32')
user_embed = Embedding(input_dim=n_users, output_dim=k_factors, input_length=1, trainable=True)(user_input)
#user_reshape = Reshape((k_factors,))
user_rks = Dense(d_rks, input_dim=k_factors,activation=cos_rks,trainable=False,weights=user_weights)(user_embed)
#user_rks.set_weights(user_weights)
user_flat = Flatten()(user_rks)

# item features
item_input = Input(shape=(1,), dtype='int32')
item_embed = Embedding(input_dim=m_items, output_dim=k_factors, input_length=1, trainable=True)(item_input)
#item_reshape = Reshape((k_factors,))
item_rks = Dense(d_rks, input_dim=k_factors,activation=cos_rks,trainable=False,weights=user_weights)(item_embed)
#user_rks.set_weights(item_weights)
item_flat = Flatten()(item_rks)

# Merge
merged = concatenate([user_flat, item_flat])

# Dense
dense_out = Dense(1, activation='linear')(merged)

# build and compile model
model = Model(inputs=[user_input, item_input], outputs=[dense_out])
model.compile(loss='mse', optimizer='adam',metrics=['mae'])
model.summary()


dot_out = dot([user_flat, item_flat],axes=1,normalize=True)
# build and compile model
model_dot = Model(inputs=[user_input, item_input], outputs=[dot_out])
model_dot.compile(loss='mse', optimizer='adam',metrics=['mae'])
model_dot.summary()


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, 1, 100)       94300       input_5[0][0]                    
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, 1, 100)       168200      input_6[0][0]                    
__________________________________________________________________________________________________
dense_7 (D

In [13]:
# fit custom model
model.fit([Users, Movies], Ratings, epochs=5,verbose=2,batch_size=20)
model.predict([np.array([196]), np.array([242])])[0][0]
model_dot.fit([Users, Movies], Ratings, epochs=5,verbose=2,batch_size=20)
model_dot.predict([np.array([196]), np.array([242])])[0][0]


Epoch 1/5
 - 25s - loss: 1.0926 - mean_absolute_error: 0.8275
Epoch 2/5
 - 25s - loss: 1.0338 - mean_absolute_error: 0.8064
Epoch 3/5
 - 24s - loss: 1.0286 - mean_absolute_error: 0.8035
Epoch 4/5
 - 22s - loss: 1.0407 - mean_absolute_error: 0.8070
Epoch 5/5
 - 23s - loss: 1.0435 - mean_absolute_error: 0.8079
Epoch 1/5
 - 33s - loss: 8.2989 - mean_absolute_error: 2.6508
Epoch 2/5
 - 33s - loss: 7.6878 - mean_absolute_error: 2.5347
Epoch 3/5
 - 32s - loss: 7.6847 - mean_absolute_error: 2.5336
Epoch 4/5
 - 32s - loss: 7.6833 - mean_absolute_error: 2.5333
Epoch 5/5
 - 32s - loss: 7.6826 - mean_absolute_error: 2.5331


0.99840218

In [14]:
print(Users[:5])
print(Movies[:5])
print(Ratings[:5])
model.predict([np.array([304,902,928,902,57]), np.array([732,1069,134,88,432])])

[304 902 928 902  57]
[ 732 1069  134   88  432]
[ 3.  4.  5.  4.  5.]


array([[ 3.9414866 ],
       [ 3.93495464],
       [ 3.93985033],
       [ 3.95518446],
       [ 3.95229721]], dtype=float32)

In [17]:
# serialize model to JSON
from keras.models import model_from_json

model_json = model.to_json()
with open("CF_model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("CF_model.h5")
print("Saved model to disk")
 

 # load json and create model
json_file = open('CF_model.json', 'r')
loaded_model = model_from_json(json_file.read(), custom_objects={'cos_rks': cos_rks})
json_file.close()

# load weights into new model
loaded_model.load_weights("CF_model.h5")
print("Loaded model from disk")

loaded_model.compile(loss='mse', optimizer='adam',metrics=['mae'])
loaded_model.summary()
loaded_model.fit([Users, Movies], Ratings, epochs=5,verbose=2,batch_size=20)
loaded_model.predict([np.array([196]), np.array([242])])[0][0]

Saved model to disk
Loaded model from disk
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, 1, 100)       94300       input_5[0][0]                    
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, 1, 100)       168200      input_6[0][0]                    
__________________________________________________________________

2.037231