In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the ratings table; the path is relative to the notebook's working directory.
ratings = pd.read_csv('data/ratings.csv')

In [3]:
# Preview the first rows to check which columns were loaded.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [4]:
# Number of rows in the ratings table.
len(ratings)

100004

## Cross-tab

Build a small cross-tab of the 15 users and the 15 movies with the most ratings.

In [5]:
# Count the ratings given by each user, then keep the 15 most active users.
user_groups = ratings['rating'].groupby(ratings['userId']).count()
top_users = user_groups.sort_values(ascending=False).head(15)

In [6]:
# Rating counts of the five most active users (index is userId).
top_users.head()

userId
547    2391
564    1868
624    1735
15     1700
73     1610
Name: rating, dtype: int64

Get the 15 movies with the most ratings.

In [7]:
# Count the ratings received by each movie, then keep the 15 most rated movies.
movie_groups = ratings['rating'].groupby(ratings['movieId']).count()
top_movies = movie_groups.sort_values(ascending=False).head(15)

In [8]:
# Rating counts of the five most rated movies (index is movieId).
top_movies.head()

movieId
356    341
296    324
318    311
593    304
260    291
Name: rating, dtype: int64

In [9]:
# Keep only the ratings where both the user and the movie are in the top-15 lists.
is_top_user = ratings['userId'].isin(top_users.index)
is_top_movie = ratings['movieId'].isin(top_movies.index)
top_r = ratings[is_top_user & is_top_movie]

In [10]:
# Number of ratings left after filtering to top users and top movies.
len(top_r)

206

In [11]:
# User x movie table of ratings; cells with no rating are NaN.
# aggfunc is given as the string 'sum' rather than np.sum: recent pandas versions emit a
# FutureWarning for the callable form and ask for the string to keep the same behaviour.
pd.crosstab(
    index=top_r.userId,
    columns=top_r.movieId,
    values=top_r.rating,
    aggfunc='sum',
)

movieId,1,110,260,296,318,356,480,527,589,593,608,1196,1198,1270,2571
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
15,2.0,3.0,5.0,5.0,2.0,1.0,3.0,4.0,4.0,5.0,5.0,5.0,4.0,5.0,5.0
30,4.0,5.0,4.0,5.0,5.0,5.0,4.0,5.0,4.0,4.0,5.0,4.0,5.0,5.0,3.0
73,5.0,4.0,4.5,5.0,5.0,5.0,4.0,5.0,3.0,4.5,4.0,5.0,5.0,5.0,4.5
212,3.0,5.0,4.0,4.0,4.5,4.0,3.0,5.0,3.0,4.0,,,3.0,3.0,5.0
213,3.0,2.5,5.0,,,2.0,5.0,,4.0,2.5,2.0,5.0,3.0,3.0,4.0
294,4.0,3.0,4.0,,3.0,4.0,4.0,4.0,3.0,,,4.0,4.5,4.0,4.5
311,3.0,3.0,4.0,3.0,4.5,5.0,4.5,5.0,4.5,2.0,4.0,3.0,4.5,4.5,4.0
380,4.0,5.0,4.0,5.0,4.0,5.0,4.0,,4.0,5.0,4.0,4.0,,3.0,5.0
452,3.5,4.0,4.0,5.0,5.0,4.0,5.0,4.0,4.0,5.0,5.0,4.0,4.0,4.0,2.0
468,4.0,3.0,3.5,3.5,3.5,3.0,2.5,,,3.0,4.0,3.0,3.5,3.0,3.0


In [12]:
# Sanity check: the raw row for user 15 / movie 1 should carry the same rating (2.0)
# that the cross-tab above shows in row 15, column 1.
ratings[(ratings.userId == 15) & (ratings.movieId == 1)]

Unnamed: 0,userId,movieId,rating,timestamp
962,15,1,2.0,997938310


## Simple Model

We build a very simple model: the predicted rating is the dot product of the user and movie embeddings, plus a per-user bias and a per-movie bias.

Because we are going to use Embedding layers, we need to re-index `userId` and `movieId` so that they are contiguous integers starting at 0.

In [13]:
# Distinct ids, in order of first appearance in the ratings table.
users = ratings['userId'].unique()
movies = ratings['movieId'].unique()

In [14]:
# Map each original id to its position in the unique-id arrays (0-based, contiguous).
userid2idx = dict(zip(users, range(len(users))))
movieid2idx = dict(zip(movies, range(len(movies))))

In [15]:
# Replace the original ids with their contiguous indices.
# NOTE: this overwrites `ratings` in place, so the cell is not idempotent - running it a
# second time would look up already-remapped indices in dicts keyed by the original ids.
ratings['movieId'] = ratings['movieId'].apply(movieid2idx.__getitem__)
ratings['userId'] = ratings['userId'].apply(userid2idx.__getitem__)

In [16]:
# The count should equal max + 1 with min == 0 if the user indices are contiguous.
ratings['userId'].nunique(), ratings['userId'].min(), ratings['userId'].max()

(671, 0, 670)

In [17]:
# The count should equal max + 1 with min == 0 if the movie indices are contiguous.
ratings['movieId'].nunique(), ratings['movieId'].min(), ratings['movieId'].max()

(9066, 0, 9065)

In [18]:
# Number of distinct users and movies; used below as the embedding table sizes.
n_users = len(ratings['userId'].unique())
n_movies = len(ratings['movieId'].unique())

We randomly split the ratings into roughly 80% for training and 20% for validation.

In [19]:
# Draw the mask from a seeded generator so that the train/validation split is the same on
# every Restart & Run All (the global np.random state is left untouched).
split_rng = np.random.RandomState(42)
# Each row independently lands in the training set with probability 0.8.
msk = split_rng.rand(len(ratings)) < 0.8
train = ratings[msk]
valid = ratings[~msk]

Now we create the model, using `50` latent factors.

In [20]:
# Size of each user/movie embedding vector (passed as n_out to embedding_input).
n_factors = 50

In [21]:
from tensorflow.contrib import keras
from tensorflow.contrib.keras import models
from tensorflow.contrib.keras import layers
from tensorflow.contrib.keras import regularizers
from tensorflow.contrib.keras import optimizers

In [22]:
def embedding_input(name, n_in, n_out, reg=1e-4):
    """Create an integer id input and a flattened, L2-regularised embedding of it.

    Args:
        name: name given to the Keras input layer.
        n_in: number of distinct ids (rows of the embedding table).
        n_out: size of each embedding vector.
        reg: L2 regularisation factor applied to the embedding weights.

    Returns:
        A tuple ``(input_tensor, embedding_tensor)``.
    """
    id_input = layers.Input(name=name, shape=(1,), dtype='int64')
    embedding_layer = layers.Embedding(
        n_in,
        n_out,
        input_length=1,
        embeddings_regularizer=regularizers.l2(reg),
    )
    embedded = embedding_layer(id_input)
    # Each sample is a single id, so flatten away the length-1 sequence axis.
    flatten = layers.Flatten()
    return id_input, flatten(embedded)

In [23]:
def create_bias(n_in, inp):
    """Create a learned scalar bias per id, looked up from ``inp``.

    Args:
        n_in: number of distinct ids (rows of the bias table).
        inp: the Keras input tensor holding the ids.

    Returns:
        The flattened output of a width-1 embedding applied to ``inp``.
    """
    # A width-1 embedding acts as a lookup table holding one scalar per id.
    bias = layers.Embedding(n_in, 1, input_length=1)(inp)
    flatten = layers.Flatten()
    return flatten(bias)

In [24]:
# Embedding table sizes that will be passed to the layer builders.
n_users, n_movies

(671, 9066)

In [25]:
# One input + latent-factor embedding per side of the (user, movie) pair.
user_inp, user_emb = embedding_input(name='user_in', n_in=n_users, n_out=n_factors)
movie_inp, movie_emb = embedding_input(name='movie_in', n_in=n_movies, n_out=n_factors)

In [26]:
# Bias terms reuse the same input tensors as the embeddings.
user_bias = create_bias(n_in=n_users, inp=user_inp)
movie_bias = create_bias(n_in=n_movies, inp=movie_inp)

The actual model

In [27]:
# rating = dot(user_emb, movie_emb) + user_bias + movie_bias
dot_product = layers.Dot(axes=1)([user_emb, movie_emb])
with_user_bias = layers.Add()([dot_product, user_bias])
x = layers.Add()([with_user_bias, movie_bias])
model = models.Model(inputs=[user_inp, movie_inp], outputs=x)

In [28]:
# Adam optimiser with step size 0.001; mean squared error on the raw rating values.
adam = optimizers.Adam(0.001)
model.compile(optimizer=adam, loss='mse')

In [29]:
# First 10 epochs; the validation loss is reported after every epoch.
model.fit(
    x=[train.userId, train.movieId],
    y=train.rating,
    batch_size=64,
    epochs=10,
    validation_data=([valid.userId, valid.movieId], valid.rating),
    verbose=2,
)

Train on 79905 samples, validate on 20099 samples
Epoch 1/10
10s - loss: 31.9418 - val_loss: 10.6155
Epoch 2/10
11s - loss: 7.7500 - val_loss: 6.3083
Epoch 3/10
12s - loss: 5.1966 - val_loss: 4.8028
Epoch 4/10
14s - loss: 3.8877 - val_loss: 3.7807
Epoch 5/10
18s - loss: 2.9884 - val_loss: 3.0802
Epoch 6/10
26s - loss: 2.3889 - val_loss: 2.6179
Epoch 7/10
26s - loss: 2.0036 - val_loss: 2.3242
Epoch 8/10
23s - loss: 1.7534 - val_loss: 2.1326
Epoch 9/10
21s - loss: 1.5824 - val_loss: 1.9942
Epoch 10/10
21s - loss: 1.4565 - val_loss: 1.8919


<tensorflow.contrib.keras.python.keras.callbacks.History at 0x11d0b4160>

In [31]:
# Train for 10 more epochs on the same model object and the same data.
model.fit(
    x=[train.userId, train.movieId],
    y=train.rating,
    batch_size=64,
    epochs=10,
    validation_data=([valid.userId, valid.movieId], valid.rating),
    verbose=2,
)

Train on 79905 samples, validate on 20099 samples
Epoch 1/10
7s - loss: 1.3576 - val_loss: 1.8056
Epoch 2/10
8s - loss: 1.2753 - val_loss: 1.7340
Epoch 3/10
8s - loss: 1.2040 - val_loss: 1.6651
Epoch 4/10
8s - loss: 1.1407 - val_loss: 1.6047
Epoch 5/10
9s - loss: 1.0826 - val_loss: 1.5482
Epoch 6/10
9s - loss: 1.0308 - val_loss: 1.4965
Epoch 7/10
9s - loss: 0.9815 - val_loss: 1.4488
Epoch 8/10
10s - loss: 0.9361 - val_loss: 1.4037
Epoch 9/10
11s - loss: 0.8948 - val_loss: 1.3614
Epoch 10/10
11s - loss: 0.8559 - val_loss: 1.3216


<tensorflow.contrib.keras.python.keras.callbacks.History at 0x11d38f080>

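Predict the rating for the user with re-indexed id 0 and the movie with re-indexed id 1 (these are the contiguous indices created above, not the original `userId`/`movieId` values).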

In [30]:
# One array per model input, in the order [user, movie]; each holds a single index.
user_idx = np.array([0])
movie_idx = np.array([1])
model.predict([user_idx, movie_idx])

array([[ 2.25669503]], dtype=float32)