## Import necessary modules

In [1]:
import pandas as pd
import numpy as np
import helpers
from tensorflow.python.keras.layers import Input, Embedding, Flatten, Dot
from tensorflow.python.keras.layers import Reshape, Add, Concatenate, Dense, Dropout
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.regularizers import l2
from tensorflow.python.keras.optimizers import Adam

In [2]:
#train_path = helpers.get_train_file_path()
ratings = pd.read_csv('foo4.csv')# helpers.get_test_file_path())
ratings.head()

Unnamed: 0,userId,movieId,rating
0,44,1,4
1,61,1,3
2,67,1,4
3,72,1,3
4,86,1,5


In [39]:
len(ratings)

1176952

In [40]:
n_users = 10000+1#ratings.userId.nunique()
print(n_users)

10001


In [41]:
n_movies = 1000+1#ratings.movieId.nunique()
print(n_movies)

1001


In [42]:
g = ratings.groupby('userId')['rating'].count()
print(g[:10])

userId
1      23
2     149
3      86
4      20
5     134
6     169
7      62
8      26
9     178
10    107
Name: rating, dtype: int64


In [43]:
topUsers = g.sort_values(ascending=False)[:15]
print(topUsers)

userId
5512    522
9711    487
8706    484
966     470
9377    458
8575    452
1000    438
1878    437
9827    433
4600    433
2038    429
1830    427
1570    425
7014    418
5289    417
Name: rating, dtype: int64


In [44]:
g = ratings.groupby('movieId')['rating'].count()
topMovies = g.sort_values(ascending=False)[:15]
top_r = ratings.join(topUsers, rsuffix='_r', how='inner', on='userId')
print(top_r[:25])
#rating_r column is the number of movies that user with userId has rated

       userId  movieId  rating  rating_r
37        966        1       5       470
1944      966        4       5       470
4938      966        5       5       470
6484      966        6       5       470
10520     966        7       4       470
14999     966        9       5       470
18173     966       11       5       470
19304     966       12       2       470
19808     966       13       5       470
20569     966       14       5       470
24336     966       15       5       470
26540     966       17       2       470
30279     966       21       5       470
33417     966       23       5       470
34113     966       24       5       470
37845     966       27       5       470
39089     966       29       5       470
39970     966       30       5       470
40708     966       31       3       470
41146     966       32       5       470
43105     966       33       5       470
45244     966       35       5       470
53481     966       45       5       470
60453     966   

In [45]:
top_r = top_r.join(topMovies, rsuffix='_r', how='inner', on='movieId')
print(top_r[:20])
# second rating_r is the total number of users that ratied the movie with id movieId

        userId  movieId  rating  rating_r  rating_r
6484       966        6       5       470      4347
6741      1570        6       5       425      4347
6853      1830        6       5       427      4347
6945      2038        6       5       429      4347
8334      5289        6       5       417      4347
9821      8706        6       5       484      4347
10137     9377        6       5       458      4347
10343     9827        6       5       433      4347
9763      8575        6       5       452      4347
20569      966       14       5       470      3986
20925     1878       14       5       437      3986
20988     2038       14       5       429      3986
22317     5512       14       5       522      3986
22939     7014       14       5       418      3986
23649     8706       14       5       484      3986
23940     9377       14       5       458      3986
20584     1000       14       5       438      3986
24076     9711       14       5       487      3986
205495     9

In [46]:
pd.crosstab(top_r.userId, top_r.movieId, top_r.rating, aggfunc=np.sum)

movieId,6,14,46,60,134,156,178,256,471,495,594,596,608,668,978
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
966,5.0,5.0,,,,,5.0,5.0,5.0,5.0,5.0,,,5.0,5.0
1000,,5.0,5.0,,5.0,,5.0,,5.0,,5.0,5.0,,5.0,3.0
1570,5.0,,,5.0,5.0,,,,,5.0,,5.0,,,5.0
1830,5.0,,5.0,5.0,,,,5.0,,5.0,,5.0,5.0,,
1878,,5.0,5.0,,5.0,,5.0,5.0,5.0,,,,,5.0,
2038,5.0,5.0,,5.0,5.0,,,,,,5.0,,,5.0,
4600,,,,,5.0,5.0,5.0,5.0,5.0,5.0,,5.0,,,5.0
5289,5.0,,5.0,5.0,5.0,5.0,5.0,5.0,5.0,,5.0,,5.0,5.0,
5512,,5.0,5.0,,,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,,5.0
7014,,5.0,5.0,5.0,5.0,5.0,,,,5.0,,,,,


In [47]:
# split train and validation
msk = np.random.rand(len(ratings)) < 0.8
train = ratings[msk]
valid = ratings[~msk]
print(len(train), len(valid))

942087 234865


## Dot Product

In [48]:
print(n_users)
print(n_movies)

10001
1001


In [49]:
n_factors = 128

### Embeddings

In [50]:
user_in = Input(shape=(1,), dtype='int64', name='user_in')
u = Embedding(n_users, n_factors, input_length=1, embeddings_regularizer=None, embeddings_initializer='glorot_normal')(user_in)
u = Reshape((n_factors,))(u)
movie_in = Input(shape=(1,), dtype='int64', name='movie_in')
m = Embedding(n_movies, n_factors, input_length=1, embeddings_regularizer=None, embeddings_initializer='glorot_normal')(movie_in)
m = Reshape((n_factors,))(m)

In [51]:
x = Dot(axes=(1))([u, m])
x = Flatten()(x)
model = Model([user_in, movie_in], x)
model.compile(Adam(0.000001), loss='mse')

In [52]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_in (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
movie_in (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 1, 128)       1280128     user_in[0][0]                    
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 1, 128)       128128      movie_in[0][0]                   
__________________________________________________________________________________________________
reshape_3 

In [53]:
batch_size=64

In [54]:
print(type(train.userId))
print(type(train.userId.values))

<class 'pandas.core.series.Series'>
<class 'numpy.ndarray'>


In [55]:
def fit_model(epochs=1):
    for i in range(epochs):
        model.fit(
            [train.userId.values, train.movieId.values], train.rating.values, batch_size=batch_size,
            validation_data=([valid.userId.values, valid.movieId.values], valid.rating.values))

In [56]:
fit_model()

Train on 942087 samples, validate on 234865 samples
Epoch 1/1



In [None]:
fit_model(10)

In [None]:
model.optimizer.lr = 0.001

In [None]:
fit_model(10)

## Dot product with bias

In [57]:
def create_bias(inp, n_in):
    x = Embedding(n_in, 1, input_length=1)(inp)
    return Flatten()(x)

In [58]:
user_in = Input(shape=(1,), dtype='int64', name='user_in')
u = Embedding(n_users, n_factors, input_length=1, embeddings_regularizer=l2(1e-8), embeddings_initializer='glorot_normal')(user_in)
u = Reshape((n_factors,))(u)
movie_in = Input(shape=(1,), dtype='int64', name='movie_in')
m = Embedding(n_movies, n_factors, input_length=1, embeddings_regularizer=l2(1e-8), embeddings_initializer='glorot_normal')(movie_in)
m = Reshape((n_factors,))(m)

Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [59]:
ub = create_bias(user_in, n_users)
mb = create_bias(movie_in, n_movies)

In [60]:
x = Dot(axes=(1))([u, m])
x = Flatten()(x)
x = Add()([x, ub])
x = Add()([x, mb])
model = Model([user_in, movie_in], x)

In [61]:
model.compile(Adam(0.000001), loss='mse')

In [62]:
fit_model(5)

Train on 942087 samples, validate on 234865 samples
Epoch 1/1

Train on 942087 samples, validate on 234865 samples
Epoch 1/1
 12224/942087 [..............................] 12224/942087 [..............................] - ETA: 12:07 - loss: 9.5501 E

KeyboardInterrupt: 

In [63]:
model.optimizer.lr = 0.001

In [None]:
fit_model(5)

## Deep Neural Network Approach

In [64]:
user_in = Input(shape=(1,), dtype='int64', name='user_in')
u = Embedding(n_users, n_factors, input_length=1, embeddings_regularizer=l2(1e-8), embeddings_initializer='glorot_normal')(user_in)
u = Reshape((n_factors,))(u)
movie_in = Input(shape=(1,), dtype='int64', name='movie_in')
m = Embedding(n_movies, n_factors, input_length=1, embeddings_regularizer=l2(1e-8), embeddings_initializer='glorot_normal')(movie_in)
m = Reshape((n_factors,))(m)

In [65]:
x = Concatenate()([u, m])
x = Flatten()(x)
x = Dense(100, activation='relu')(x)
x = Dropout(0.5)(x)
x = Dense(1)(x)
model = Model([user_in, movie_in], x)
model.compile(Adam(0.001), loss='mse')

In [66]:
fit_model(1)

Train on 942087 samples, validate on 234865 samples
Epoch 1/1



In [None]:
fit_model(5)