In [3]:
from datasets.movie_lens import MovieLensDataset

# Dataset wrapper object; the following cell obtains the ratings through
# `dataset.Ratings()`.
dataset = MovieLensDataset()

In [4]:
# Fetch the ratings accessor and the frame holding every rating row.
ratings = dataset.Ratings()
all_ratings = ratings.DataFrame()

# Distinct values found in the user-id and movie-id columns of that frame.
unique_users = all_ratings[ratings.UserIdColumnName()].unique()
unique_movies = all_ratings[ratings.MovieIdColumnName()].unique()

# Headline sizes of the data set; the embedding sizing below is derived
# from these counts.
user_count, movie_count, rating_count = (
    len(unique_users),
    len(unique_movies),
    len(all_ratings),
)

# Report the three totals, one per line.
for _label, _value in (
    ("Total number of users:   ", user_count),
    ("Total number of movies:  ", movie_count),
    ("Total number of ratings: ", rating_count),
):
    print(_label, _value)

Total number of users:    283228
Total number of movies:   53889
Total number of ratings:  27753444


In [5]:
# Rule-of-thumb sizing: use the rounded fourth root of the number of distinct
# entities as the candidate embedding dimension for that entity type.
user_embeddings_dimension, movie_embeddings_dimension = (
    round(entity_count ** 0.25) for entity_count in (user_count, movie_count)
)

# Show the candidate dimensions side by side.
for _label, _value in (
    ("Potential user embeddings dimension:  ", user_embeddings_dimension),
    ("Potential movie embeddings dimension: ", movie_embeddings_dimension),
):
    print(_label, _value)

Potential user embeddings dimension:   23
Potential movie embeddings dimension:  15


In [6]:
# If the movie features must be learned entirely by the model, every
# component of every user and movie embedding is an unknown of the system.
system_unknowns = sum(
    entity_count * dimension
    for entity_count, dimension in (
        (user_count, user_embeddings_dimension),
        (movie_count, movie_embeddings_dimension),
    )
)

# Each training rating contributes one equation; assume 80% of the ratings
# are used for training.
system_equations = round(rating_count * 0.8)

# Compare the number of equations against the number of unknowns.
for _label, _value in (
    ("Potential number of unknown embeddings variables: ", system_unknowns),
    ("Total number of equations in the system:          ", system_equations),
    ("The ratio of equations to unknowns:               ", system_equations/system_unknowns),
):
    print(_label, _value)

Potential number of unknown embeddings variables:  7322579
Total number of equations in the system:           22202755
The ratio of equations to unknowns:                3.0320949763737612
