In [29]:
from collections import defaultdict
import os

from scipy import sparse
from tqdm import tqdm
import numpy as np
import pandas as pd


def load_ratings(filename):
    dirpath = '../data/ml-latest-small'
    ratings = pd.read_csv(os.path.join(dirpath, filename))
    return ratings


def get_user_movie_dictionary(dataframe):
    users = dataframe.userId.unique()
    movies = dataframe.movieId.unique()

    user2idx = {user: idx for idx, user in enumerate(users)}
    movie2idx = {movie: idx for idx, movie in enumerate(movies)}

    return user2idx, movie2idx


def transform_binary_matrix(dataframe, user2idx, movie2idx):
    rows = list()
    cols = list()
    data = list()

    stat = defaultdict(int)

    for user, movie, rating in zip(
            dataframe['userId'], dataframe['movieId'], dataframe['rating']):
        user_idx = user2idx[user]
        movie_idx = movie2idx[movie]

        rows.append(user_idx)
        cols.append(movie_idx)
        if rating >= 2.0:
            data.append(1.0)
            stat['pos'] += 1
        else:
            data.append(-1.0)
            stat['neg'] += 1

    matrix = sparse.csr_matrix(
        (data, (rows, cols)),
        shape=(len(user2idx), len(movie2idx))
    )
    return matrix, stat


def split_matrix(original, user2idx, movie2idx):
    np.random.seed(2020)

    N_user = original.shape[0]
    N_movie = original.shape[1]

    rows_tr = list()
    cols_tr = list()
    data_tr = list()

    rows_val = list()
    cols_val = list()
    data_val = list()

    for rdx, cdx in tqdm(zip(*original.nonzero())):
        rated_movie = len(original[rdx, :].nonzero()[1])
        rated_user = len(original[:, cdx].nonzero()[0])

        threshold = (rated_movie / N_movie) * (rated_user / N_user) + 0.8
        random_number = np.random.rand()
        if random_number <= threshold:
            rows_tr.append(rdx)
            cols_tr.append(cdx)
            data_tr.append(original[rdx, cdx])
        else:
            rows_val.append(rdx)
            cols_val.append(cdx)
            data_val.append(original[rdx, cdx])

    train_matrix = sparse.csr_matrix(
        (data_tr, (rows_tr, cols_tr)), shape=(len(user2idx), len(movie2idx))
    )
    validation_matrix = sparse.csr_matrix(
        (data_val, (rows_val, cols_val)), shape=(len(user2idx), len(movie2idx))
    )

    return train_matrix, validation_matrix

In [30]:
rating_df = load_ratings('ratings.csv')
user2idx, movie2idx = get_user_movie_dictionary(rating_df)
print(f'# of user: {len(user2idx)}\t# of movie: {len(movie2idx)}')

rating_matrix, stat =\
    transform_binary_matrix(rating_df, user2idx, movie2idx)
print(
    f'Positive Feedback: {stat["pos"]}',
    f'\tNegative Feedback: {stat["neg"]}'
)

rating_matrix_train, rating_matrix_val =\
    split_matrix(rating_matrix, user2idx, movie2idx)

print(
    f'Train: {rating_matrix_train.count_nonzero()}\t',
    f'Validation Size: {rating_matrix_val.count_nonzero()}'
)

0it [00:00, ?it/s]# of user: 610	# of movie: 9724
Positive Feedback: 94864 	Negative Feedback: 5972
100836it [00:52, 1904.60it/s]Train: 81216	 Validation Size: 19620



In [31]:
recommendations_mf = np.load('../output/rec_mf.npz')
recommendations_auto = np.load('../output/rec_auto.npz')

In [32]:
recommendations_mf = recommendations_mf['arr_0']
recommendations_auto = recommendations_auto['arr_0']

In [33]:
recommendations_mf.shape

(610, 100)

In [34]:
recommendations_auto.shape

(610, 100)

In [35]:
dirpath = '../data/ml-latest-small'
filename = 'movies.csv'
movies = pd.read_csv(os.path.join(dirpath, filename))
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [44]:
[movies.loc[movies.movieId == idx2movie[idx], 'title'] for idx in rating_matrix_val[1, :].nonzero()[1] if rating_matrix_val[1, idx] == 1]

[291    Tommy Boy (1995)
 Name: title, dtype: object,
 6710    Dark Knight, The (2008)
 Name: title, dtype: object,
 6801    Step Brothers (2008)
 Name: title, dtype: object,
 7323    Exit Through the Gift Shop (2010)
 Name: title, dtype: object,
 7372    Inception (2010)
 Name: title, dtype: object,
 8550    Ex Machina (2015)
 Name: title, dtype: object]

In [45]:
[movies.loc[movies.movieId == idx2movie[idx], 'title'] for idx in recommendations_mf[1, :10]]

[7372    Inception (2010)
 Name: title, dtype: object,
 2226    Fight Club (1999)
 Name: title, dtype: object,
 257    Pulp Fiction (1994)
 Name: title, dtype: object,
 6710    Dark Knight, The (2008)
 Name: title, dtype: object,
 510    Silence of the Lambs, The (1991)
 Name: title, dtype: object,
 314    Forrest Gump (1994)
 Name: title, dtype: object,
 46    Usual Suspects, The (1995)
 Name: title, dtype: object,
 4800    Lord of the Rings: The Return of the King, The...
 Name: title, dtype: object,
 1939    Matrix, The (1999)
 Name: title, dtype: object,
 4137    Lord of the Rings: The Two Towers, The (2002)
 Name: title, dtype: object]

In [46]:
[movies.loc[movies.movieId == idx2movie[idx], 'title'] for idx in recommendations_auto[1, :10]]

[7802    Intouchables (2011)
 Name: title, dtype: object,
 6258    Pursuit of Happyness, The (2006)
 Name: title, dtype: object,
 6743    Iron Man (2008)
 Name: title, dtype: object,
 6710    Dark Knight, The (2008)
 Name: title, dtype: object,
 7466    King's Speech, The (2010)
 Name: title, dtype: object,
 2907    Almost Famous (2000)
 Name: title, dtype: object,
 4427    Pirates of the Caribbean: The Curse of the Bla...
 Name: title, dtype: object,
 8569    The Imitation Game (2014)
 Name: title, dtype: object,
 6405    Ratatouille (2007)
 Name: title, dtype: object,
 7039    Up (2009)
 Name: title, dtype: object]