In [1]:
from theano.sandbox import cuda

In [2]:
from __future__ import division, print_function
%matplotlib inline
import utils; reload(utils)
from utils import *
from keras import backend as K
from keras.layers import Embedding

Using TensorFlow backend.


In [3]:
# Dataset and model-output locations. The 20M-rating dataset path is kept
# below for reference; the small dataset is the active one.
#path = "data/ml-20m/"
path = "/mnt/data/ml-latest-small/"
model_path = '/mnt/models/ml-latest-small/'
# makedirs (unlike mkdir) also creates missing parent directories, so this
# works on a machine where the parent of model_path does not exist yet.
if not os.path.exists(model_path):
    os.makedirs(model_path)
batch_size = 64  # mini-batch size (the fit calls in this notebook pass 64 as well)

## Set up data

We're working with the MovieLens dataset, which contains one rating per row, like this:

In [4]:
# Load the ratings table (one row per user/movie rating) and preview it.
ratings_csv = path + 'ratings.csv'
ratings = pd.read_csv(ratings_csv)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [5]:
# Movie metadata, indexed by movieId so a title can be looked up by id.
movie_names = pd.read_csv(path + 'movies.csv').set_index('movieId')
movie_names.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [6]:
# Map the raw ids that occur in the ratings to contiguous 0-based indices
# (numbered in order of first appearance in `ratings`).
users = ratings.userId.unique()
movies = ratings.movieId.unique()
user2idx = dict(zip(users, range(len(users))))
movie2idx = dict(zip(movies, range(len(movies))))

In [7]:
# Attach the contiguous indices to every rating row; these are the values
# fed to the models as user / movie inputs.
ratings['new_user_id'] = ratings['userId'].map(lambda user_id: user2idx[user_id])
ratings['new_movie_id'] = ratings['movieId'].map(lambda movie_id: movie2idx[movie_id])

In [8]:
# Number of distinct users / movies; used as the row counts of the embedding tables.
n_users, n_movies = ratings.userId.nunique(), ratings.movieId.nunique()

## Split into training and validation data

In [9]:
# Random ~80/20 split of the rating rows into training and validation sets.
# A dedicated, seeded generator makes the split identical on every run, so
# validation losses from different runs / models are comparable, and it does
# not disturb NumPy's global random state.
split_seed = 42
msk = np.random.RandomState(split_seed).rand(len(ratings)) < 0.8
trn = ratings[msk]
val = ratings[~msk]

## Config

In [9]:
# Length of each latent-factor vector; passed as the embedding output dimension below.
n_factors = 50

## Fit Dot Product Model

In [61]:
# Each sample supplies one integer index per input (a movie index and a user index).
movie_in = Input(shape=(1,), dtype='int64', name='movie_in')
user_in = Input(shape=(1,), dtype='int64', name='user_in')
# Embedding tables: one n_factors-long row per movie and per user, L2-regularised.
movie_emb = Embedding(n_movies, n_factors, input_length=1, W_regularizer=l2(1e-4))(movie_in)
# The user table is sized by n_users because it is indexed by new_user_id.
user_emb = Embedding(n_users, n_factors, input_length=1, W_regularizer=l2(1e-4))(user_in)

In [62]:
# Predicted rating = dot product of the user and movie factor vectors,
# flattened to one value per sample.
x = Flatten()(merge([user_emb, movie_emb], mode='dot'))
model = Model([user_in, movie_in], x)
model.compile(loss='mse', optimizer=Adam(0.01))

In [49]:
# Train for 10 epochs, evaluating on the validation split after each epoch.
model.fit(
    [trn.new_user_id, trn.new_movie_id], trn.rating,
    validation_data=([val.new_user_id, val.new_movie_id], val.rating),
    batch_size=64, nb_epoch=10)

Train on 80186 samples, validate on 19818 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f9eec048fd0>

## Bias Model

In [81]:
# Fresh inputs for the bias model: one integer index per sample for each.
user_in = Input(shape=(1,), dtype='int64', name='user_in')
movie_in = Input(shape=(1,), dtype='int64', name='movie_in')
# Latent-factor tables (n_factors values per user / per movie), L2-regularised.
user_emb = Embedding(n_users, n_factors, input_length=1, W_regularizer=l2(1e-4))(user_in)
movie_emb = Embedding(n_movies, n_factors, input_length=1, W_regularizer=l2(1e-4))(movie_in)

In [82]:
# Scalar bias per movie and per user, stored as embeddings with a single output value.
movie_bias = Embedding(n_movies, 1, input_length=1)(movie_in)
user_bias = Embedding(n_users, 1, input_length=1)(user_in)

In [83]:
# Prediction = (user factors . movie factors) + user bias + movie bias.
interaction = merge([user_emb, movie_emb], mode='dot')
with_user_bias = merge([interaction, user_bias], mode='sum')
with_both_biases = merge([with_user_bias, movie_bias], mode='sum')
x = Flatten()(with_both_biases)

In [84]:
# Assemble the bias model and train it on mean squared error with Adam (lr 0.001).
model = Model([user_in, movie_in], x)
model.compile(loss='mse', optimizer=Adam(0.001))

In [85]:
# One warm-up epoch at the initial learning rate.
model.fit(
    [trn.new_user_id, trn.new_movie_id], trn.rating,
    validation_data=([val.new_user_id, val.new_movie_id], val.rating),
    batch_size=64, nb_epoch=1)

Train on 80196 samples, validate on 19808 samples
Epoch 1/1


<keras.callbacks.History at 0x7f5f5c70fd90>

In [86]:
# Raise the learning rate to 0.01 for one epoch.
# The optimizer's lr is a backend variable that the already-built training
# function reads, so its value is updated in place. Rebinding the attribute
# to a Python float would not reach the training function created by the
# earlier fit() call.
K.set_value(model.optimizer.lr, 0.01)
model.fit([trn.new_user_id, trn.new_movie_id], trn.rating, nb_epoch=1,
          batch_size=64, validation_data=([val.new_user_id, val.new_movie_id], val.rating))

Train on 80196 samples, validate on 19808 samples
Epoch 1/1


<keras.callbacks.History at 0x7f5f683e0350>

In [87]:
# Drop the learning rate back to 0.001 for five more epochs.
if isinstance(model.optimizer.lr, float):
    # lr has been rebound to a plain float at some point, so the backend
    # variable used by the existing training function is no longer reachable.
    # Recompile with a fresh optimizer: the learned weights are kept, but
    # Adam's running averages start over.
    model.compile(optimizer=Adam(0.001), loss='mse')
else:
    # Normal case: update the backend variable in place so the existing
    # training function picks up the new value.
    K.set_value(model.optimizer.lr, 0.001)
model.fit([trn.new_user_id, trn.new_movie_id], trn.rating, nb_epoch=5,
          batch_size=64, validation_data=([val.new_user_id, val.new_movie_id], val.rating))

Train on 80196 samples, validate on 19808 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f5f5c646850>

## Basic Neural Network 

In [115]:
# Inputs and (unregularised) embeddings for the neural-network model.
user_in = Input(shape=(1,), dtype='int64', name='user_in')
movie_in = Input(shape=(1,), dtype='int64', name='movie_in')
user_emb = Embedding(input_dim=n_users, output_dim=n_factors, input_length=1)(user_in)
movie_emb = Embedding(input_dim=n_movies, output_dim=n_factors, input_length=1)(movie_in)

In [116]:
# A separate 50-unit ReLU layer on each embedding, concatenated and flattened,
# then a single ReLU output unit that produces the predicted rating.
x1 = Dense(50, activation='relu')(user_emb)
x2 = Dense(50, activation='relu')(movie_emb)
x = Flatten()(merge([x1, x2], mode='concat'))
x = Dense(1, activation='relu')(x)

In [120]:
# NOTE(review): in top-to-bottom order `model` is only rebuilt from this
# section's layers in the following cell, so this prints the summary of
# whatever model is currently bound to `model`. Consider running it after
# the Model(...) / compile cell instead.
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
user_in (InputLayer)             (None, 1)             0                                            
____________________________________________________________________________________________________
movie_in (InputLayer)            (None, 1)             0                                            
____________________________________________________________________________________________________
embedding_24 (Embedding)         (None, 1, 32)         21472       user_in[0][0]                    
____________________________________________________________________________________________________
embedding_25 (Embedding)         (None, 1, 32)         290112      movie_in[0][0]                   
___________________________________________________________________________________________

In [118]:
# Assemble the neural-network model; mean squared error loss, Adam at lr 0.001.
model = Model([user_in, movie_in], x)
model.compile(loss='mse', optimizer=Adam(lr=0.001))

In [119]:
# Single training epoch, evaluated on the validation split.
model.fit(
    [trn.new_user_id, trn.new_movie_id], trn.rating,
    validation_data=([val.new_user_id, val.new_movie_id], val.rating),
    batch_size=64, nb_epoch=1)

Train on 79868 samples, validate on 20136 samples
Epoch 1/1


<keras.callbacks.History at 0x7fe4e3f91350>

## Analyze Results

In [88]:
# Count ratings per movie index, most-rated first, and keep the indices of
# the 2000 most-rated movies.
g = ratings.groupby(['new_movie_id'])['rating'].count().sort_values(ascending=False)
top_movies = g.index.values[:2000]

In [89]:
# Sub-model that maps a movie index straight to its learned bias term.
# NOTE(review): `movie_in` has to be the same Input that `movie_bias` was built
# from. The "Basic Neural Network" section rebinds `movie_in`, so in
# top-to-bottom order this cell depends on hidden state - confirm the intended
# execution order.
model = Model(movie_in, movie_bias)
# Squeeze the prediction down to one bias value per movie. The array is bound
# back to `movie_bias` because the next cell reads it under that name; note
# that this replaces the layer output, so this cell cannot simply be re-run.
movie_bias = np.squeeze(model.predict(top_movies))

In [97]:
# Pair each top movie's bias with its title: index -> original movieId
# (via `movies`) -> title (via `movie_names`).
movie_ratings = [
    (bias, movie_names.loc[movies[idx], 'title'])
    for bias, idx in zip(movie_bias, top_movies)
]

In [100]:
# The 100 movies with the lowest bias, ascending.
sorted(movie_ratings, key=lambda pair: pair[0])[:100]

[(-0.1884041, '6th Day, The (2000)'),
 (-0.18208164, 'Airheads (1994)'),
 (-0.17474726, 'Boys on the Side (1995)'),
 (-0.16939932, 'Sleeper (1973)'),
 (-0.14775528, 'Willow (1988)'),
 (-0.13248335, 'Hero (1992)'),
 (-0.12714814, 'Lock, Stock & Two Smoking Barrels (1998)'),
 (-0.11726499, 'Evil Dead, The (1981)'),
 (-0.10472398, 'Man with the Golden Gun, The (1974)'),
 (-0.098918885, 'Beautiful Girls (1996)'),
 (-0.094652072, 'Santa Clause, The (1994)'),
 (-0.076515056, 'Harry Potter and the Chamber of Secrets (2002)'),
 (-0.075517446, 'Devil in a Blue Dress (1995)'),
 (-0.060368951, 'Men in Black (a.k.a. MIB) (1997)'),
 (-0.054589927, 'Road Warrior, The (Mad Max 2) (1981)'),
 (-0.046583485, 'Go (1999)'),
 (-0.046527948, 'Romeo and Juliet (1968)'),
 (-0.045749452,
  "Wes Craven's New Nightmare (Nightmare on Elm Street Part 7: Freddy's Finale, A) (1994)"),
 (-0.042307891, 'Tropic Thunder (2008)'),
 (-0.039903451, 'Guardians of the Galaxy (2014)'),
 (-0.039056458, 'Kick-Ass (2010)'),
 (-0

In [101]:
# The 100 movies with the highest bias, descending.
sorted(movie_ratings, key=lambda pair: pair[0], reverse=True)[:100]

[(0.49692023, 'Zero Effect (1998)'),
 (0.49681184, '21 Jump Street (2012)'),
 (0.48738471, 'Ref, The (1994)'),
 (0.47501162, 'Thank You for Smoking (2006)'),
 (0.45782319, 'Pitch Black (2000)'),
 (0.44987682, 'Green Mile, The (1999)'),
 (0.44281125,
  'Laputa: Castle in the Sky (Tenk\xc3\xbb no shiro Rapyuta) (1986)'),
 (0.44086549, 'It Happened One Night (1934)'),
 (0.43999392, 'Old Boy (2003)'),
 (0.43588305, 'Walk the Line (2005)'),
 (0.43144017, 'Paths of Glory (1957)'),
 (0.43070376, 'Maltese Falcon, The (1941)'),
 (0.4289608,
  'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)'),
 (0.42529488, 'Harold and Maude (1971)'),
 (0.42335135, 'Major League (1989)'),
 (0.42151135, 'Gravity (2013)'),
 (0.42046589, 'Young Guns (1988)'),
 (0.42014745, 'Great Escape, The (1963)'),
 (0.41951522, 'Nutty Professor, The (1963)'),
 (0.41891679, "City Slickers II: The Legend of Curly's Gold (1994)"),
 (0.41255155, 'Major Payne (1995)'),
 (0.40908408, 'My Cousin Vinny (199

In [102]:
from sklearn.decomposition import PCA

In [108]:
# Sub-model that maps a movie index to its embedding vector; squeeze the
# prediction to one row of factors per top movie.
model = Model(movie_in, movie_emb)
top_movie_vectors = model.predict(top_movies)
mm_emb = np.squeeze(top_movie_vectors)

In [115]:
# Fit a 3-component PCA on the movie embeddings (fit() returns the estimator
# itself) and display the fitted estimator.
pca = PCA(n_components=3).fit(mm_emb)
pca

PCA(copy=True, n_components=3, whiten=False)

In [116]:
# Fraction of the embeddings' variance captured by each of the 3 components.
pca.explained_variance_ratio_

array([ 0.3248,  0.2935,  0.1665], dtype=float32)

In [110]:
# Sanity check: one row per top movie, one column per latent factor.
mm_emb.shape

(2000, 50)

In [30]:
# Peek at the first training targets (the raw ratings).
trn['rating'].head()

0    2.5
1    3.0
2    3.0
3    2.0
4    4.0
Name: rating, dtype: float64