In [1]:
from theano.sandbox import cuda

Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5103)


In [2]:
%matplotlib inline
import utils; reload(utils)
from utils import *
from __future__ import division, print_function

Using Theano backend.


In [3]:
path = "data/ml-latest-small/"
model_path = path + 'models/'
if not os.path.exists(model_path): os.mkdir(model_path)
batch_size=64

In [4]:
# The folder that contains movie + user data that analysis will be performed on.
ratings = pd.read_csv(path+'ratings.csv')
# The first rows show that the data consists of 4 columns()
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [5]:
len(ratings)

100004

In [6]:
# This is a way to access the name of the movie, for visualizing purposes.
# names_movies = pd.read_csv(path+'movies.csv')
movie_names = pd.read_csv(path+'movies.csv').set_index('movieId')['title'].to_dict()

In [7]:
# There are 9125 movies
len(movie_names)

9125

In [8]:
# Unique users and movies are deciphered to ensure that unnecessary noise is reduced.
users = ratings.userId.unique()
# There are 671 unique users
print(len(users))
movies = ratings.movieId.unique()
# There are 9066 unique movies.
print(len(movies))

671
9066


In [9]:
# Each unique user and movie is enumerated and individual user:index key values are generated. 
userid2idx = {o:i for i,o in enumerate(users)}
movieid2idx = {o:i for i,o in enumerate(movies)}

In [10]:
# User & movie IDs are updated such that they are
ratings.movieId = ratings.movieId.apply(lambda x: movieid2idx[x])
ratings.userId = ratings.userId.apply(lambda x: userid2idx[x])

In [11]:
user_min, user_max, movie_min, movie_max = (ratings.userId.min(), ratings.userId.max(), ratings.movieId.min(), ratings.movieId.max())
# Below, it is confirmed that there are 671 unique users
# And there are 9066 movies.
user_min, user_max, movie_min, movie_max

(0, 670, 0, 9065)

In [12]:
n_users = ratings.userId.nunique()
n_movies = ratings.movieId.nunique()
n_users, n_movies

(671, 9066)

In [13]:
# Declaring and arbitrary number(50) that are the number of latent factors
# For creating each embedding.
n_factors = 50

In [14]:
# This allows for controlled randomness of the numbers.
np.random.seed = 42

In [15]:
# Data is split randomly between training and test data
msk = np.random.rand(len(ratings)) < 0.8
trn = ratings[msk]
# print(trn.head)
# 79866 training data
print(len(trn))
val = ratings[~msk]
# 20138 validation data
print(len(val))
# print(val.head)

79866
20138


In [16]:
# g grabs userId by rating and
g=ratings.groupby('userId')['rating'].count()
# then the top 15 users that watch the most movies.
topUsers=g.sort_values(ascending=False)[:15]
topUsers.head()

userId
546    2391
563    1868
623    1735
14     1700
72     1610
Name: rating, dtype: int64

In [17]:
# again, g is the variable that finds the movies that have been rated the most fequently
g=ratings.groupby('movieId')['rating'].count()
# These are the 15 most watched movies
topMovies=g.sort_values(ascending=False)[:15]
topMovies.head()

movieId
57     341
49     324
99     311
92     304
143    291
Name: rating, dtype: int64

In [18]:
top_r = ratings.join(topUsers, rsuffix='_r', how='inner', on='userId')
top_r.head()

Unnamed: 0,userId,movieId,rating,timestamp,rating_r
962,14,417,2.0,997938310,1700
963,14,650,2.0,1134521380,1700
964,14,651,4.5,1093070098,1700
965,14,652,4.0,1040205753,1700
966,14,20,3.0,1093028290,1700


In [19]:
top_r = top_r.join(topMovies, rsuffix='_r', how='inner', on='movieId')
top_r.head()

Unnamed: 0,userId,movieId,rating,timestamp,rating_r,rating_r.1
962,14,417,2.0,997938310,1700,247
5048,29,417,4.0,944943070,1011,247
10214,72,417,5.0,1303464840,1610,247
28390,211,417,3.0,1218405007,876,247
29266,212,417,3.0,1462637445,910,247


In [20]:
# The crosstab shows great insight, and also it shows that users 
# 14, 29 and 72 have watched an rated every single movie.
pd.crosstab(top_r.userId, top_r.movieId, top_r.rating, aggfunc=np.sum)

movieId,27,49,57,72,79,89,92,99,143,179,180,197,402,417,505
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
14,3.0,5.0,1.0,3.0,4.0,4.0,5.0,2.0,5.0,5.0,4.0,5.0,5.0,2.0,5.0
29,5.0,5.0,5.0,4.0,5.0,4.0,4.0,5.0,4.0,4.0,5.0,5.0,3.0,4.0,5.0
72,4.0,5.0,5.0,4.0,5.0,3.0,4.5,5.0,4.5,5.0,5.0,5.0,4.5,5.0,4.0
211,5.0,4.0,4.0,3.0,5.0,3.0,4.0,4.5,4.0,,3.0,3.0,5.0,3.0,
212,2.5,,2.0,5.0,,4.0,2.5,,5.0,5.0,3.0,3.0,4.0,3.0,2.0
293,3.0,,4.0,4.0,4.0,3.0,,3.0,4.0,4.0,4.5,4.0,4.5,4.0,
310,3.0,3.0,5.0,4.5,5.0,4.5,2.0,4.5,4.0,3.0,4.5,4.5,4.0,3.0,4.0
379,5.0,5.0,5.0,4.0,,4.0,5.0,4.0,4.0,4.0,,3.0,5.0,4.0,4.0
451,4.0,5.0,4.0,5.0,4.0,4.0,5.0,5.0,4.0,4.0,4.0,4.0,2.0,3.5,5.0
467,3.0,3.5,3.0,2.5,,,3.0,3.5,3.5,3.0,3.5,3.0,3.0,4.0,4.0


In [21]:
# The most basic model is a dotproduct of the user embedding & the movie embedding.
# user_in is the input vector of the user
user_in = Input(shape=(1,), dtype='int64', name='user_in')
# u is/are the embeddings for each user. So, the inputs to the Emberdding() function are
# Embedding(671, 50, input_lenght=1, W_regularizer=l2(le-4))(user_in)
u = Embedding(n_users, n_factors, input_length=1, W_regularizer=l2(1e-4))(user_in)
# The Embedding is a Tensor and is associated with Theano
print(type(u))

# The same as above, but this time for an input of each movie.
movie_in = Input(shape=(1,), dtype='int64', name='movie_in')
m = Embedding(n_movies, n_factors, input_length=1, W_regularizer=l2(1e-4))(movie_in)
# The Embedding is a Tensor and is associated with Theano and it is the movie embedding 
# That consists of 50 latent factors.
print(type(m))


<class 'theano.tensor.var.TensorVariable'>
<class 'theano.tensor.var.TensorVariable'>


In [22]:
# The dot product of the user and movie embeddings is calculated
x = merge([u, m], mode='dot')
# print(u)
# The dot product is then flattened into one contiguous vector
x = Flatten()(x)
# print(x)
# This is the model
model = Model([user_in, movie_in], x)
# A computational graph is created
model.compile(Adam(0.001), loss='mse')

# model.summary()

In [23]:
# The model is trained by provideing the userId & movieId as input and the raiting as the target valye
# It is ran for 6 epochs
model.fit([trn.userId, trn.movieId], trn.rating, batch_size=64, nb_epoch=6, 
          validation_data=([val.userId, val.movieId], val.rating))

Train on 79866 samples, validate on 20138 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7fa39910fcd0>

In [24]:
model.optimizer.lr=0.01

In [25]:
model.fit([trn.userId, trn.movieId], trn.rating, batch_size=64, nb_epoch=3, 
          validation_data=([val.userId, val.movieId], val.rating))

Train on 79866 samples, validate on 20138 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fa3b8257d50>

In [26]:
model.optimizer.lr=0.001

In [27]:
# Since this data has been tested far and wide, the closest benchmark is at
# a loss of 0.9, but I am doing considerably poorly. The next step will
# be a step to decrease that loss value.
model.fit([trn.userId, trn.movieId], trn.rating, batch_size=64, nb_epoch=6, 
          validation_data=([val.userId, val.movieId], val.rating))

Train on 79866 samples, validate on 20138 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7fa39910fdd0>

In [28]:
# In order to decrease the loss function further, biases are added for each movie
# and each user to represent how positive or negative each user is and how good a movie is.

# It is calculated by creating an embedding
def embedding_input(name, n_in, n_out, reg):
    inp = Input(shape=(1,), dtype='int64', name=name)
    return inp, Embedding(n_in, n_out, input_length=1, W_regularizer=l2(reg))(inp)

In [29]:
# This outputs u, the user embedding(u) and the user_in as calculated by the embedding_input() function
user_in, u = embedding_input('user_in', n_users, n_factors, 1e-4)
# This outputs u, the user embedding(u) and the user_in
movie_in, m = embedding_input('movie_in', n_movies, n_factors, 1e-4)

In [30]:
# A bias is created by creating an embedding with a single value
def create_bias(inp, n_in):
    x = Embedding(n_in, 1, input_length=1)(inp)
    return Flatten()(x)

In [31]:
ub = create_bias(user_in, n_users)
mb = create_bias(movie_in, n_movies)

In [32]:
# After adding bias, the new model needs to add
# the user bias and the movie bias to improve accuracy.
x = merge([u, m], mode='dot')
x = Flatten()(x)
x = merge([x, ub], mode='sum')
x = merge([x, mb], mode='sum')
model = Model([user_in, movie_in], x)
model.compile(Adam(0.001), loss='mse')

In [33]:
# Achieved state of the art according to http://www.librec.net/example.html
model.fit([trn.userId, trn.movieId], trn.rating, batch_size=64, nb_epoch=6, 
          validation_data=([val.userId, val.movieId], val.rating))

Train on 79866 samples, validate on 20138 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7fa397488350>

In [34]:
model.optimizer.lr=0.01

In [35]:
model.fit([trn.userId, trn.movieId], trn.rating, batch_size=64, nb_epoch=6, 
          validation_data=([val.userId, val.movieId], val.rating))

Train on 79866 samples, validate on 20138 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7fa39910fd90>

In [39]:
# Neural Network using the Keras Functional API

user_in, u = embedding_input('user_in', n_users, n_factors, 1e-5)
movie_in, m = embedding_input('movie_in', n_movies, n_factors, 1e-5)

x = merge([u, m], mode='concat')
x = Flatten()(x)
# Dense layer
x = Dense(100, activation='relu')(x)
# Adding Dropout
x = Dropout(0.5)(x)
# Activation function 
x = Dense(1)(x)

model = Model([user_in, movie_in], x)
model.compile(Adam(0.001), loss='mse')

In [40]:
# Just 6 epochs shatter the state of the art achieved by Stanford Researchers(0.89 loss). I achieved (0.8090 loss)
model.fit([trn.userId, trn.movieId], trn.rating, batch_size=64, nb_epoch=5, validation_data=([val.userId, val.movieId], val.rating))

Train on 79866 samples, validate on 20138 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fa394d89190>