In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

In [0]:
from tensorflow.keras.layers import Input, Embedding, Concatenate, Flatten, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import SGD

In [3]:
!wget -nc http://files.grouplens.org/datasets/movielens/ml-20m.zip

--2020-06-04 12:45:25--  http://files.grouplens.org/datasets/movielens/ml-20m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 198702078 (189M) [application/zip]
Saving to: ‘ml-20m.zip’


2020-06-04 12:45:38 (15.4 MB/s) - ‘ml-20m.zip’ saved [198702078/198702078]



In [4]:
!unzip -n ml-20m.zip

Archive:  ml-20m.zip
   creating: ml-20m/
  inflating: ml-20m/genome-scores.csv  
  inflating: ml-20m/genome-tags.csv  
  inflating: ml-20m/links.csv        
  inflating: ml-20m/movies.csv       
  inflating: ml-20m/ratings.csv      
  inflating: ml-20m/README.txt       
  inflating: ml-20m/tags.csv         


In [0]:
df = pd.read_csv('ml-20m/ratings.csv')

In [6]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [0]:
df.userId = pd.Categorical(df.userId)
df['new_user_id'] = df.userId.cat.codes

In [0]:
df.movieId = pd.Categorical(df.movieId)
df['new_movie_id'] = df.movieId.cat.codes

In [9]:
df

Unnamed: 0,userId,movieId,rating,timestamp,new_user_id,new_movie_id
0,1,2,3.5,1112486027,0,1
1,1,29,3.5,1112484676,0,28
2,1,32,3.5,1112484819,0,31
3,1,47,3.5,1112484727,0,46
4,1,50,3.5,1112484580,0,49
...,...,...,...,...,...,...
20000258,138493,68954,4.5,1258126920,138492,13754
20000259,138493,69526,4.5,1259865108,138492,13862
20000260,138493,69644,3.0,1260209457,138492,13875
20000261,138493,70286,5.0,1258126944,138492,13993


In [0]:
user_ids = df['new_user_id'].values
movie_ids = df['new_movie_id'].values
ratings = df['rating'].values

In [11]:
user_ids

array([     0,      0,      0, ..., 138492, 138492, 138492], dtype=int32)

In [12]:
N = len(set(user_ids))
M = len(set(movie_ids))

K = 10 #setting embedding diemnsion

N, M

(138493, 26744)

In [0]:
#User input
u = Input(shape=(1,))

#Movie input
m = Input(shape=(1,))

#user_embedding
u_emb = Embedding(N, K)(u)

#movie_embedding
m_emb = Embedding(M, K)(m)

#Flatten both embeddings
u_emb = Flatten()(u_emb)
m_emb = Flatten()(m_emb)

#Concatenate user-movie embeddings into a feature vector
x = Concatenate()([u_emb, m_emb])

x =  Dense(1024, activation='relu')(x)
x = Dense(1)(x)

model = Model(inputs=[u, m], outputs=x)

In [14]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1, 10)        1384930     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 10)        267440      input_2[0][0]                    
______________________________________________________________________________________________

In [0]:
model.compile(loss='mse', optimizer=SGD(lr=0.08, momentum=0.9))

In [0]:
user_ids, movie_ids, ratings = shuffle(user_ids, movie_ids, ratings)

In [0]:
Ntrain = int(0.8*len(ratings))

In [0]:
train_user = user_ids[:Ntrain]
train_movie = movie_ids[:Ntrain]
train_ratings = ratings[:Ntrain]

test_user = user_ids[Ntrain:]
test_movie = movie_ids[Ntrain:]
test_ratings = ratings[Ntrain:]

In [0]:
# center the ratings
avg_rating = train_ratings.mean()
train_ratings = train_ratings - avg_rating
test_ratings = test_ratings - avg_rating

In [20]:
r = model.fit( x=[train_user, train_movie], y=train_ratings, epochs=25, batch_size=1024, verbose=2, validation_data=([test_user, test_movie], test_ratings))

Epoch 1/25
15626/15626 - 84s - loss: 0.7782 - val_loss: 0.7202
Epoch 2/25
15626/15626 - 86s - loss: 0.7011 - val_loss: 0.6968
Epoch 3/25
15626/15626 - 84s - loss: 0.6787 - val_loss: 0.6832
Epoch 4/25
15626/15626 - 84s - loss: 0.6632 - val_loss: 0.6750
Epoch 5/25
15626/15626 - 85s - loss: 0.6521 - val_loss: 0.6712
Epoch 6/25
15626/15626 - 84s - loss: 0.6393 - val_loss: 0.6584
Epoch 7/25
15626/15626 - 83s - loss: 0.6228 - val_loss: 0.6487
Epoch 8/25
15626/15626 - 84s - loss: 0.6096 - val_loss: 0.6426
Epoch 9/25
15626/15626 - 85s - loss: 0.5999 - val_loss: 0.6385
Epoch 10/25
15626/15626 - 83s - loss: 0.5909 - val_loss: 0.6362
Epoch 11/25
15626/15626 - 82s - loss: 0.5820 - val_loss: 0.6364
Epoch 12/25
15626/15626 - 83s - loss: 0.5738 - val_loss: 0.6319
Epoch 13/25
15626/15626 - 87s - loss: 0.5666 - val_loss: 0.6304
Epoch 14/25
15626/15626 - 85s - loss: 0.5602 - val_loss: 0.6253
Epoch 15/25
15626/15626 - 86s - loss: 0.5547 - val_loss: 0.6233
Epoch 16/25
15626/15626 - 87s - loss: 0.5503 - va

In [0]:
plt.plot(r.history['loss'], label='train_loss')
plt.plot(r.history['val_loss'], label='val_loss')
plt.legend()
plt.show()

In [21]:
np.sqrt(0.6244)

0.7901898506055364

In [22]:
avg_rating

3.5254539784165333