In [1]:
import math

import numpy as np
import pandas as pd
import tensorflow as tf
from keras.layers import Embedding, Reshape, Dropout, Dense, Dot, Concatenate, Input, Flatten
from keras.layers.merge import dot
from keras.models import Model, Sequential
from tensorflow.python.keras.regularizers import l2
from keras.optimizers import Adamax, Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint


Using TensorFlow backend.


### Model

In [2]:
class CFModel(Model):
    """Matrix-factorisation collaborative-filtering model.

    Every user and every item is mapped to a ``k_factors``-dimensional
    embedding vector; the predicted rating is the dot product of the two.

    The network has two inputs (user index, item index), so it is assembled
    with the Keras functional API. A ``Sequential`` model is a single linear
    stack of layers and merge layers such as ``Dot`` must be called on
    tensors, not on model objects.

    Parameters
    ----------
    n_users : int
        Number of rows in the user embedding table; user indices must lie
        in ``[0, n_users)``.
    m_items : int
        Number of rows in the item embedding table; item indices must lie
        in ``[0, m_items)``.
    k_factors : int
        Dimensionality of the latent factor vectors.
    **kwargs
        Forwarded to ``keras.models.Model`` (for example ``name``).
    """

    def __init__(self, n_users, m_items, k_factors, **kwargs):
        user_in = Input(shape=(1,), dtype='int64')
        item_in = Input(shape=(1,), dtype='int64')

        # Embedding yields (batch, 1, k_factors); Reshape drops the length-1
        # axis so both branches are plain (batch, k_factors) matrices.
        user_vec = Embedding(n_users, k_factors, input_length=1)(user_in)
        user_vec = Reshape((k_factors,))(user_vec)
        item_vec = Embedding(m_items, k_factors, input_length=1)(item_in)
        item_vec = Reshape((k_factors,))(item_vec)

        # Row-wise dot product -> (batch, 1) predicted rating.
        rating = Dot(axes=1)([user_vec, item_vec])

        super(CFModel, self).__init__(inputs=[user_in, item_in], outputs=rating, **kwargs)

    def rate(self, user_id, item_id):
        """Return the predicted rating for a single (user_id, item_id) pair."""
        return self.predict([np.array([user_id]), np.array([item_id])])[0][0]


class DeepModel(Model):
    """Neural collaborative-filtering model.

    User and item embeddings are concatenated and passed through a small
    fully connected network (dropout -> ReLU dense -> dropout -> linear
    dense) that outputs the predicted rating.

    The network has two inputs, so it is assembled with the Keras functional
    API: ``Concatenate`` has to be called on the list of branch tensors (its
    first constructor argument is the concatenation axis, not the inputs).

    Parameters
    ----------
    n_users : int
        Number of rows in the user embedding table; user indices must lie
        in ``[0, n_users)``.
    m_items : int
        Number of rows in the item embedding table; item indices must lie
        in ``[0, m_items)``.
    k_factors : int
        Dimensionality of each embedding and width of the hidden layer.
    p_dropout : float, optional
        Dropout rate applied before and after the hidden layer.
    **kwargs
        Forwarded to ``keras.models.Model`` (for example ``name``).
    """

    def __init__(self, n_users, m_items, k_factors, p_dropout=0.1, **kwargs):
        user_in = Input(shape=(1,), dtype='int64')
        item_in = Input(shape=(1,), dtype='int64')

        # Embedding yields (batch, 1, k_factors); Reshape drops the length-1
        # axis so both branches are plain (batch, k_factors) matrices.
        user_vec = Embedding(n_users, k_factors, input_length=1)(user_in)
        user_vec = Reshape((k_factors,))(user_vec)
        item_vec = Embedding(m_items, k_factors, input_length=1)(item_in)
        item_vec = Reshape((k_factors,))(item_vec)

        hidden = Concatenate()([user_vec, item_vec])
        hidden = Dropout(p_dropout)(hidden)
        hidden = Dense(k_factors, activation='relu')(hidden)
        hidden = Dropout(p_dropout)(hidden)
        rating = Dense(1, activation='linear')(hidden)

        super(DeepModel, self).__init__(inputs=[user_in, item_in], outputs=rating, **kwargs)

    def rate(self, user_id, item_id):
        """Return the predicted rating for a single (user_id, item_id) pair."""
        return self.predict([np.array([user_id]), np.array([item_id])])[0][0]

### Load data

In [3]:
# Notebook-wide settings used by later cells.
K_FACTORS = 120      # embedding dimension used when the model is defined
RNG_SEED = 1446557   # seed for the reproducible shuffle of the ratings

# Read the ratings table and discard the 'index' column stored in the CSV.
ratings = pd.read_csv('foo2.csv').drop('index', axis=1)
ratings.head()

Unnamed: 0,userid,movieid,rating
0,44,1,4
1,61,1,3
2,67,1,4
3,72,1,3
4,86,1,5


In [4]:
from sklearn.preprocessing import LabelEncoder

# One encoder per id column, fitted on the distinct ids present in the data.
user_enc = LabelEncoder().fit(ratings['userid'].unique())
movie_enc = LabelEncoder().fit(ratings['movieid'].unique())

# Number of distinct users / movies seen by the encoders.
n_users, n_movies = len(user_enc.classes_), len(movie_enc.classes_)

In [5]:
# Replace the raw ids with their encoded indices.
# NOTE(review): this overwrites `ratings`, so re-running the cell would feed
# already-encoded values back into encoders fitted on the raw ids.
ratings = ratings.assign(
    userid=user_enc.transform(ratings['userid']),
    movieid=movie_enc.transform(ratings['movieid']),
)
ratings.head()

Unnamed: 0,userid,movieid,rating
0,43,0,4
1,60,0,3
2,66,0,4
3,71,0,3
4,85,0,5


In [6]:
# Reproducibly shuffle every row (frac=1. keeps the full table).
shuffled_ratings = ratings.sample(frac=1., random_state=RNG_SEED)

# Pull each column out as a plain NumPy array for Keras.
Users = shuffled_ratings.userid.values
Movies = shuffled_ratings.movieid.values
Ratings = shuffled_ratings.rating.values

print('Users:', Users, ', shape =', Users.shape)
print('Movies:', Movies, ', shape =', Movies.shape)
print('Ratings:', Ratings, ', shape =', Ratings.shape)

Users: [5118 8984 4603 ... 1635 9523 4863] , shape = (1176952,)
Movies: [754 465 620 ... 910 819 105] , shape = (1176952,)
Ratings: [5 2 5 ... 4 4 3] , shape = (1176952,)


### Define model

### Scratch cell: experiment with different model definitions

In [17]:
# The network has two inputs (user index, movie index), so it is built with
# the Keras functional API rather than a Sequential model.
#
# The embedding tables are sized from n_users / n_movies (the class counts of
# the fitted label encoders) instead of hard-coded limits, so every encoded
# id is guaranteed to have a row in its table.
user_in = Input(shape=(1,), dtype='int64', name='user_in')
u = Embedding(n_users, K_FACTORS, input_length=1, embeddings_regularizer=None, embeddings_initializer='glorot_normal')(user_in)
print(u)
# (batch, 1, K_FACTORS) -> (batch, K_FACTORS, 1) so Dot can contract axis 1.
u = Reshape((K_FACTORS, 1), name='reshape_users')(u)

movie_in = Input(shape=(1,), dtype='int64', name='movie_in')
m = Embedding(n_movies, K_FACTORS, input_length=1, embeddings_regularizer=None, embeddings_initializer='glorot_normal')(movie_in)
print(m)
m = Reshape((K_FACTORS, 1), name='reshape_movies')(m)

# Dot product of the user and movie factor vectors -> (batch, 1, 1).
x = Dot(axes=1)([u, m])
print(x)
# Collapse to (batch, 1) so the output lines up with the ratings vector.
x = Flatten(name='flatten')(x)

model = Model([user_in, movie_in], x)
# NOTE(review): a learning rate of 1e-6 is very small for Adam; confirm it is
# intentional before running a full training session.
model.compile(Adam(0.000001), loss='mse')
#model.compile(loss='mse', optimizer='adamax')

Tensor("embedding_11/embedding_lookup:0", shape=(?, 1, 120), dtype=float32)
Tensor("embedding_12/embedding_lookup:0", shape=(?, 1, 120), dtype=float32)
Tensor("dot_6/MatMul:0", shape=(?, 1, 1), dtype=float32)


In [18]:
# Disabled alternative: build the model through the CFModel class instead of
# the functional-API cell above. Nothing in this cell executes.
#max_userid = ratings['userid'].drop_duplicates().max()+1
#max_movieid = ratings['movieid'].drop_duplicates().max()+1

#model = CFModel(max_userid, max_movieid, K_FACTORS)
#model.compile(loss='mse', optimizer='adamax')

### Train model

In [19]:
# Stop once validation loss has not improved for 2 consecutive epochs, and
# keep only the best-scoring weights on disk.
callbacks = [
    EarlyStopping(monitor='val_loss', patience=2),
    ModelCheckpoint('model_weights.h5', save_best_only=True),
]

# `epochs` is the Keras 2 name of the argument formerly called `nb_epoch`;
# the old spelling only works through a deprecation shim that emits a warning.
history = model.fit(
    [Users, Movies],
    Ratings,
    epochs=30,
    validation_split=.2,
    verbose=2,
    callbacks=callbacks,
)

  This is separate from the ipykernel package so we can avoid doing imports until


Train on 1059256 samples, validate on 117696 samples
Epoch 1/30


KeyboardInterrupt: 

### Plot training and validation RMSE

In [None]:
# Disabled plotting cell: nothing here executes.
# NOTE(review) before re-enabling:
#   * `DataFrame.ix` has been removed from current pandas releases; plot
#     `loss` directly or use `.loc` / `.iloc`.
#   * `figsize={7,10}` is a set literal; matplotlib expects a
#     (width, height) tuple.
#loss = pd.DataFrame({'epoch': [ i + 1 for i in history.epoch ],
 #                    'training': [ math.sqrt(loss) for loss in history.history['loss'] ],
 #                    'validation': [ math.sqrt(loss) for loss in history.history['val_loss'] ]})
#ax = loss.ix[:,:].plot(x='epoch', figsize={7,10}, grid=True)
#ax.set_ylabel("root mean squared error")
#ax.set_ylim([0.0,3.0]);

### Print best validation RMSE

In [None]:
# Locate the epoch with the lowest validation loss (earliest epoch wins ties)
# and report it as an RMSE: the model is trained with MSE loss, so the square
# root of the loss is the root mean squared error.
idx, min_val_loss = min(enumerate(history.history['val_loss']), key=lambda pair: pair[1])
# print() call form: the rest of the notebook runs under Python 3, where the
# Python 2 print statement is a SyntaxError.
print('Minimum RMSE at epoch', '{:d}'.format(idx+1), '=', '{:.4f}'.format(math.sqrt(min_val_loss)))