In [2]:
from collections import defaultdict
import os

from scipy import sparse
from tqdm import tqdm
import numpy as np
import pandas as pd


def load_ratings(filename):
    dirpath = '../data/ml-latest-small'
    ratings = pd.read_csv(os.path.join(dirpath, filename))
    return ratings


def get_user_movie_dictionary(dataframe):
    users = dataframe.userId.unique()
    movies = dataframe.movieId.unique()

    user2idx = {user: idx for idx, user in enumerate(users)}
    movie2idx = {movie: idx for idx, movie in enumerate(movies)}

    return user2idx, movie2idx


def transform_binary_matrix(dataframe, user2idx, movie2idx):
    rows = list()
    cols = list()
    data = list()

    stat = defaultdict(int)

    for user, movie, rating in zip(
            dataframe['userId'], dataframe['movieId'], dataframe['rating']):
        user_idx = user2idx[user]
        movie_idx = movie2idx[movie]

        rows.append(user_idx)
        cols.append(movie_idx)
        if rating >= 2.0:
            data.append(1.0)
            stat['pos'] += 1
        else:
            data.append(-1.0)
            stat['neg'] += 1

    matrix = sparse.csr_matrix(
        (data, (rows, cols)),
        shape=(len(user2idx), len(movie2idx))
    )
    return matrix, stat


def split_matrix(original, user2idx, movie2idx):
    np.random.seed(2020)

    N_user = original.shape[0]
    N_movie = original.shape[1]

    rows_tr = list()
    cols_tr = list()
    data_tr = list()

    rows_val = list()
    cols_val = list()
    data_val = list()

    for rdx, cdx in tqdm(zip(*original.nonzero())):
        rated_movie = len(original[rdx, :].nonzero()[1])
        rated_user = len(original[:, cdx].nonzero()[0])

        threshold = (rated_movie / N_movie) * (rated_user / N_user) + 0.8
        random_number = np.random.rand()
        if random_number <= threshold:
            rows_tr.append(rdx)
            cols_tr.append(cdx)
            data_tr.append(original[rdx, cdx])
        else:
            rows_val.append(rdx)
            cols_val.append(cdx)
            data_val.append(original[rdx, cdx])

    train_matrix = sparse.csr_matrix(
        (data_tr, (rows_tr, cols_tr)), shape=(len(user2idx), len(movie2idx))
    )
    validation_matrix = sparse.csr_matrix(
        (data_val, (rows_val, cols_val)), shape=(len(user2idx), len(movie2idx))
    )

    return train_matrix, validation_matrix

In [3]:
rating_df = load_ratings('ratings.csv')
user2idx, movie2idx = get_user_movie_dictionary(rating_df)
print(f'# of user: {len(user2idx)}\t# of movie: {len(movie2idx)}')

rating_matrix, stat =\
    transform_binary_matrix(rating_df, user2idx, movie2idx)
print(
    f'Positive Feedback: {stat["pos"]}',
    f'\tNegative Feedback: {stat["neg"]}'
)

rating_matrix_train, rating_matrix_val =\
    split_matrix(rating_matrix, user2idx, movie2idx)

print(
    f'Train: {rating_matrix_train.count_nonzero()}\t',
    f'Validation Size: {rating_matrix_val.count_nonzero()}'
)

0it [00:00, ?it/s]# of user: 610	# of movie: 9724
Positive Feedback: 94864 	Negative Feedback: 5972
100836it [00:51, 1961.18it/s]Train: 81216	 Validation Size: 19620



In [4]:
embedding = np.load('../output/embedding.npz')
embedding = embedding['arr_0']
embedding.shape

(9724, 100)

In [5]:
dirpath = '../data/ml-latest-small'
filename = 'movies.csv'
movies = pd.read_csv(os.path.join(dirpath, filename))
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [6]:
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
item_similarity = cosine_similarity(embedding)

In [8]:
top_similar_items = np.argsort(item_similarity, axis=-1)[:, ::-1][:, :11]

In [9]:
idx2movie = {idx: movie for movie, idx in movie2idx.items()}

In [10]:
[(movies.loc[movies.movieId == idx2movie[idx], 'title'], item_similarity[0, idx]) for idx in top_similar_items[0, :]]

[(0    Toy Story (1995)
  Name: title, dtype: object,
  1.0),
 (295    What's Eating Gilbert Grape (1993)
  Name: title, dtype: object,
  0.9924701),
 (418    Jurassic Park (1993)
  Name: title, dtype: object,
  0.9918151),
 (31    Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
  Name: title, dtype: object,
  0.9909825),
 (322    Lion King, The (1994)
  Name: title, dtype: object,
  0.9904566),
 (615    Independence Day (a.k.a. ID4) (1996)
  Name: title, dtype: object,
  0.990292),
 (577    Truth About Cats & Dogs, The (1996)
  Name: title, dtype: object,
  0.9902755),
 (43    Seven (a.k.a. Se7en) (1995)
  Name: title, dtype: object,
  0.99021757),
 (592    Rock, The (1996)
  Name: title, dtype: object,
  0.99011785),
 (314    Forrest Gump (1994)
  Name: title, dtype: object,
  0.9900517),
 (506    Aladdin (1992)
  Name: title, dtype: object,
  0.9897231)]

In [11]:
sample_movies = [88125, 122912, 122892, 122922] # Harry Poter, Inifinity War, Age of Ultron, Dr Strange
[movie2idx[movie] for movie in sample_movies]

[1938, 2134, 1365, 1082]

In [12]:
movie_vec = embedding[1938] + embedding[2134] - embedding[1365]
movie_vec = movie_vec.reshape(1, -1)
movie_vec.shape

(1, 100)

In [13]:
movie_similariry = cosine_similarity(movie_vec, embedding)
similar_movies = np.argsort(movie_similariry[0, :], axis=-1)[::-1]
[(movies.loc[movies.movieId == idx2movie[idx], 'title'], movie_similariry[0, idx]) for idx in similar_movies[:50]]

[(7644    Harry Potter and the Deathly Hallows: Part 2 (...
  Name: title, dtype: object,
  0.98372984),
 (8693    Avengers: Infinity War - Part I (2018)
  Name: title, dtype: object,
  0.97471595),
 (2226    Fight Club (1999)
  Name: title, dtype: object,
  0.97328526),
 (705    Citizen Kane (1941)
  Name: title, dtype: object,
  0.97305995),
 (684    Breakfast at Tiffany's (1961)
  Name: title, dtype: object,
  0.9725243),
 (7955    Skyfall (2012)
  Name: title, dtype: object,
  0.97247815),
 (2020    Run Lola Run (Lola rennt) (1998)
  Name: title, dtype: object,
  0.9716353),
 (8546    Big Hero 6 (2014)
  Name: title, dtype: object,
  0.9715785),
 (929    Raging Bull (1980)
  Name: title, dtype: object,
  0.9714977),
 (6772    WALL·E (2008)
  Name: title, dtype: object,
  0.9714916),
 (2558    Do the Right Thing (1989)
  Name: title, dtype: object,
  0.9714886),
 (907    Clockwork Orange, A (1971)
  Name: title, dtype: object,
  0.97141594),
 (918    Ran (1985)
  Name: title, dtype:

In [14]:
# Dr. Strange
movie_similariry[0, 1082]

0.9622914

In [17]:
# Dr. Strange
similar_movies.tolist().index(1082)

672