This notebook uses the MovieLens 20M dataset, which contains 20 million movie ratings.

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tensorflow.keras.layers import Input, Dense, Flatten, Concatenate, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import SGD, Adam
from sklearn.utils import shuffle
import wget

In [6]:
# Read the ratings CSV into a DataFrame and preview its first ten rows.
df = pd.read_csv(filepath_or_buffer="data/ratings.csv")
df.head(n=10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580
5,1,112,3.5,1094785740
6,1,151,4.0,1094785734
7,1,223,4.0,1112485573
8,1,253,4.0,1112484940
9,1,260,4.0,1112484826


In [9]:
# The user ids must be consecutive integers running from 0 to N-1
# (N = number of distinct users). Casting the column to a categorical
# dtype gives every distinct userId an integer category code in
# exactly that range.
df["userId"] = df["userId"].astype("category")
# Store those 0..N-1 codes as the re-indexed user id.
df["new_user_id"] = df["userId"].cat.codes

In [10]:
# Preview the first five rows to check the freshly added new_user_id column.
df.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp,new_user_id
0,1,2,3.5,1112486027,0
1,1,29,3.5,1112484676,0
2,1,32,3.5,1112484819,0
3,1,47,3.5,1112484727,0
4,1,50,3.5,1112484580,0


In [11]:
# Re-index the movie ids with the categorical-code technique: cast the
# column to a categorical dtype, then keep each movie's integer
# category code (0 to M-1, M = number of distinct movies) as its new id.
df["movieId"] = df["movieId"].astype("category")
df["new_movie_id"] = df["movieId"].cat.codes

In [15]:
# Pull the re-indexed ids and the ratings out of the frame as three
# separate arrays.
user_ids, movie_ids, ratings = (
    df[column].values
    for column in ("new_user_id", "new_movie_id", "rating")
)

In [17]:
# Number of distinct users (N) and distinct movies (M).
# np.unique counts the distinct ids in one vectorized pass; building a
# Python set from the arrays would hash every element individually,
# which is slow and memory-hungry on tens of millions of rows.
N = len(np.unique(user_ids))
M = len(np.unique(movie_ids))

# embedding dimensions
K = 16

print(f"Number of users: {N}")
print(f"Number of Movies: {M}")

Number of users: 138493
Number of Movies: 26744


# Model

In [21]:
# Each sample carries two scalar inputs: a user index and a movie index.
u = Input(shape=(1,))
m = Input(shape=(1,))

# Look up a K-dimensional vector for each index. Embedding(num_ids, K)
# yields a (num_samples, 1, K) tensor; Flatten then drops the length-1
# axis, leaving (num_samples, K).
u_emb = Flatten()(Embedding(N, K)(u))
m_emb = Flatten()(Embedding(M, K)(m))

# Join the user and movie vectors into one (num_samples, 2K) feature
# vector, pass it through a single hidden ReLU layer, and finish with a
# single linear output unit.
x = Concatenate()([u_emb, m_emb])
x = Dense(512, activation='relu')(x)
x = Dense(1)(x)

In [24]:
# Wrap the (user, movie) -> output graph in a Model and configure training.
model = Model(inputs=[u, m], outputs=x)
model.compile(
    loss='mse',
    # `learning_rate` is the supported argument name; `lr` is only a
    # deprecated alias that newer Keras releases no longer accept.
    optimizer=SGD(learning_rate=0.08, momentum=0.9),
    # The target is a continuous rating optimized with MSE, so
    # classification accuracy is not a meaningful metric here; track
    # mean absolute error instead.
    metrics=['mae']
)

## Train & Validation Data Split

In [26]:
# Shuffle the three arrays in unison before splitting. A fixed
# random_state makes the train/validation split reproducible from a
# fresh kernel instead of changing on every run.
SPLIT_SEED = 42
user_ids, movie_ids, ratings = shuffle(
    user_ids, movie_ids, ratings, random_state=SPLIT_SEED
)

# The first 80% of the shuffled samples form the training set; the
# remaining 20% are held out for validation.
N_train = int(0.8 * len(user_ids))

train_user, valid_user = user_ids[:N_train], user_ids[N_train:]
train_movie, valid_movie = movie_ids[:N_train], movie_ids[N_train:]
train_ratings, valid_ratings = ratings[:N_train], ratings[N_train:]

print(f"Shape of train_user is {train_user.shape}")
print(f"Shape of valid_user is {valid_user.shape}")


Shape of train_user is (16000210,)
Shape of valid_user is (4000053,)
