In [1]:
from theano.sandbox import cuda

In [1]:
%matplotlib inline
import utils; reload(utils)
from utils import *
from __future__ import division, print_function
from keras.layers import Embedding

Using TensorFlow backend.


In [2]:
# Locations of the input data and of the directory used for saved models.
# The commented-out line is an alternative dataset location kept for reference.
#path = "data/ml-20m/"
path = "/mnt/data/ml-latest-small/"
model_path = '/mnt/models/ml-latest-small/'
# makedirs (unlike mkdir) also creates any missing parent directory, so the
# cell does not fail when /mnt/models itself does not exist yet.
if not os.path.exists(model_path): os.makedirs(model_path)
# Mini-batch size for the training runs in this notebook.
batch_size=64

## Set up data

We're working with the MovieLens dataset, which contains one rating per row, like this:

In [23]:
# Read the ratings table from the data directory and preview the first rows.
ratings_csv = path + 'ratings.csv'
ratings = pd.read_csv(ratings_csv)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [24]:
# Read the movie metadata (id, title, genres) and preview the first rows.
movies_csv = path + 'movies.csv'
movie_names = pd.read_csv(movies_csv)
movie_names.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [25]:
# The raw userId / movieId values are not contiguous, so give every distinct
# id a 0-based position (its index in the array of unique values).
users = ratings.userId.unique()
movies = ratings.movieId.unique()
user2idx = dict(zip(users, range(len(users))))
movie2idx = dict(zip(movies, range(len(movies))))

In [26]:
# Store the contiguous indices next to the raw ids on every rating row;
# these new columns are what the models are fed.
ratings['new_user_id'] = [user2idx[user_id] for user_id in ratings.userId]
ratings['new_movie_id'] = [movie2idx[movie_id] for movie_id in ratings.movieId]

In [27]:
# Number of distinct users and movies; used as the row counts of the
# embedding tables.
n_users = ratings['userId'].nunique()
n_movies = ratings['movieId'].nunique()

## Split into training and validation data

In [28]:
# Random row-level split: roughly 80% of the ratings go to training and the
# rest to validation.
# A dedicated generator with a fixed seed gives the same split on every run,
# so results stay comparable between runs, and leaves numpy's global random
# state untouched.
split_rng = np.random.RandomState(42)
msk = (split_rng.rand(len(ratings)) < 0.8)
trn = ratings[msk]
val = ratings[~msk]

## Config

In [29]:
# Number of latent factors, i.e. the width of each user / movie embedding vector.
n_factors = 50

## Fit Dot Product Model

In [11]:
# Each sample supplies one integer index per input: the contiguous movie
# index and the contiguous user index.
movie_in = Input(shape=(1,), dtype='int64', name='movie_in')
user_in = Input(shape=(1,), dtype='int64', name='user_in')
# One L2-regularised vector of n_factors values per movie.
movie_emb = Embedding(n_movies, n_factors, input_length=1, W_regularizer=l2(1e-4))(movie_in)
# The user table needs one row per user, so it is sized by n_users
# (not n_movies), matching how the later models in this notebook build it.
user_emb = Embedding(n_users, n_factors, input_length=1, W_regularizer=l2(1e-4))(user_in)

In [12]:
# The prediction is the dot product of the user and movie embeddings,
# flattened, and fitted with mean squared error.
user_movie_dot = merge([user_emb, movie_emb], mode='dot')
x = Flatten()(user_movie_dot)
model = Model([user_in, movie_in], x)
model.compile(loss='mse', optimizer=Adam(lr=0.01))

In [49]:
# Train the dot-product model for 10 epochs, reporting loss on the validation
# split after each one. The mini-batch size is taken from the notebook's
# `batch_size` setting rather than a repeated literal, so changing the
# setting actually affects training.
model.fit([trn.new_user_id, trn.new_movie_id], trn.rating,
          batch_size=batch_size, nb_epoch=10,
          validation_data=([val.new_user_id, val.new_movie_id], val.rating))

Train on 80186 samples, validate on 19818 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f9eec048fd0>

## Bias Model

In [34]:
# Fresh input tensors and L2-regularised latent-factor embeddings for the
# bias model (one table row per user / per movie).
user_in = Input(shape=(1,), dtype='int64', name='user_in')
movie_in = Input(shape=(1,), dtype='int64', name='movie_in')
user_emb = Embedding(n_users, n_factors, input_length=1, W_regularizer=l2(1e-4))(user_in)
movie_emb = Embedding(n_movies, n_factors, input_length=1, W_regularizer=l2(1e-4))(movie_in)

In [35]:
# A single learned scalar per movie and per user, looked up with the same
# index inputs as the latent-factor embeddings.
movie_bias = Embedding(n_movies, 1, input_length=1)(movie_in)
user_bias = Embedding(n_users, 1, input_length=1)(user_in)

In [36]:
# prediction = dot(user, movie) + user bias + movie bias, then flattened.
interaction = merge([user_emb, movie_emb], mode='dot')
biased = merge([interaction, user_bias], mode='sum')
biased = merge([biased, movie_bias], mode='sum')
x = Flatten()(biased)

In [37]:
# Wrap the graph in a model and compile it with Adam (lr 0.001) and MSE loss.
adam = Adam(lr=0.001)
model = Model([user_in, movie_in], x)
model.compile(loss='mse', optimizer=adam)

In [38]:
# One epoch at the learning rate the model was compiled with.
# The mini-batch size is taken from the notebook's `batch_size` setting
# rather than a repeated literal, so changing the setting affects training.
model.fit([trn.new_user_id, trn.new_movie_id], trn.rating,
          batch_size=batch_size, nb_epoch=1,
          validation_data=([val.new_user_id, val.new_movie_id], val.rating))

Train on 80170 samples, validate on 19834 samples
Epoch 1/1


<keras.callbacks.History at 0x7fe9e27c2f10>

In [39]:
# Train one more epoch at a higher learning rate (0.01).
# The model has already been fit, so its training function is already built
# around the optimizer's original learning-rate variable; assigning a plain
# float to `model.optimizer.lr` would only rebind the Python attribute and
# leave the rate actually used unchanged. Recompiling with a new optimizer
# makes the new rate take effect: the learned weights are kept, while Adam's
# running moment estimates start over.
model.compile(optimizer=Adam(lr=0.01), loss='mse')
model.fit([trn.new_user_id, trn.new_movie_id], trn.rating,
          batch_size=batch_size, nb_epoch=1,
          validation_data=([val.new_user_id, val.new_movie_id], val.rating))

Train on 80170 samples, validate on 19834 samples
Epoch 1/1


<keras.callbacks.History at 0x7fea05ba8b50>

In [46]:
# Train five more epochs at learning rate 0.001.
# Once a model has been fit, assigning a plain float to `model.optimizer.lr`
# only rebinds the Python attribute; the already-built training function keeps
# using the optimizer's original learning-rate variable. Recompiling with a
# new optimizer guarantees the rate requested here is the one used: the
# learned weights are kept, while Adam's running moment estimates start over.
model.compile(optimizer=Adam(lr=0.001), loss='mse')
model.fit([trn.new_user_id, trn.new_movie_id], trn.rating,
          batch_size=batch_size, nb_epoch=5,
          validation_data=([val.new_user_id, val.new_movie_id], val.rating))

Train on 80170 samples, validate on 19834 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fe9e267ff50>

## Basic Neural Network 

In [66]:
# Fresh inputs and (unregularised) embeddings for the neural-network model.
user_in = Input(shape=(1,), dtype='int64', name='user_in')
movie_in = Input(shape=(1,), dtype='int64', name='movie_in')
user_emb = Embedding(input_dim=n_users, output_dim=n_factors, input_length=1)(user_in)
movie_emb = Embedding(input_dim=n_movies, output_dim=n_factors, input_length=1)(movie_in)

In [67]:
# Each embedding goes through its own 50-unit ReLU layer; the two branches
# are then concatenated and flattened.
x1 = Dense(50, activation='relu')(user_emb)
x2 = Dense(50, activation='relu')(movie_emb)
x = merge([x1, x2], mode='concat')
x = Flatten()(x)
# Single output unit holding the predicted rating.
# NOTE(review): a ReLU on the output unit can get stuck emitting 0 with no
# gradient; a linear output is the usual choice for regression - confirm the
# activation here is intended.
x = Dense(1, activation='relu')(x)

In [53]:
# NOTE(review): `model` is only rebuilt from the layers defined above in a
# later cell, so when the notebook is run top to bottom this summarises the
# previously built model - confirm that is the one meant to be shown here.
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
user_in (InputLayer)             (None, 1)             0                                            
____________________________________________________________________________________________________
movie_in (InputLayer)            (None, 1)             0                                            
____________________________________________________________________________________________________
embedding_19 (Embedding)         (None, 1, 50)         33550       user_in[0][0]                    
____________________________________________________________________________________________________
embedding_20 (Embedding)         (None, 1, 50)         453300      movie_in[0][0]                   
___________________________________________________________________________________________

In [68]:
# Build the neural-network model from the graph above and compile it with
# Adam (lr 0.001) and MSE loss.
adam = Adam(lr=0.001)
model = Model([user_in, movie_in], x)
model.compile(loss='mse', optimizer=adam)

In [69]:
# Train the neural-network model for one epoch, validating on the held-out
# split. The mini-batch size is taken from the notebook's `batch_size`
# setting rather than a repeated literal, so changing the setting affects
# training.
model.fit([trn.new_user_id, trn.new_movie_id], trn.rating,
          batch_size=batch_size, nb_epoch=1,
          validation_data=([val.new_user_id, val.new_movie_id], val.rating))

Train on 80170 samples, validate on 19834 samples
Epoch 1/1


<keras.callbacks.History at 0x7fe9e18113d0>