In [152]:
import glob
import json
import os
from collections import defaultdict

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

## V1 Taking the first embedding only

In [142]:
def load_first_embeddings(pattern='data/embeddings_v1/*/0.txt'):
    """Read the first summary embedding of every movie matched by ``pattern``.

    Each matched file must hold one comma-separated vector of floats. The name
    of the directory that contains the file is used as the movie id.

    Args:
        pattern: glob pattern selecting one embedding file per movie directory.

    Returns:
        dict mapping movie id -> list of floats, inserted in sorted path order.

    Raises:
        ValueError: if a file contains a value that cannot be parsed as a float.
    """
    first_embeddings = dict()
    # glob returns paths in arbitrary, filesystem-dependent order; sorting keeps
    # the order of everything derived from this dict stable between runs.
    for embedding_path in sorted(glob.glob(pattern)):
        # The parent directory name is the movie id. os.path copes with either
        # path separator and any directory depth, unlike split('/')[2].
        movie_id = os.path.basename(os.path.dirname(embedding_path))
        with open(embedding_path, 'r') as fp:
            first_embeddings[movie_id] = [float(val) for val in fp.read().split(',')]
    return first_embeddings


# take the first summary embedding only
embeddings_v1 = load_first_embeddings()

In [143]:
# Stack the per-movie vectors into one 2-D array; row order follows dict insertion order.
embeddings_v1_matrix = np.array(list(embeddings_v1.values()))

In [144]:
# Sanity check: display the dimensions of the stacked V1 matrix.
embeddings_v1_matrix.shape

(10904, 1536)

In [145]:
# Persist the V1 matrix in NumPy's binary .npy format.
np.save('data/embeddings_v1.npy', embeddings_v1_matrix)

In [116]:
# Persist the movie ids as a single ', '-separated line, in dict insertion order
# (the same order used for the rows of the V1 matrix).
with open('data/movie_ids.txt', 'w') as ids_file:
    id_line = ', '.join(embeddings_v1)
    ids_file.write(id_line)

In [118]:
# is this faster?
# Reload the persisted matrix and rebuild the movie id -> row index lookup from disk.
with open('data/embeddings_v1.npy', 'rb') as matrix_file:
    loaded_embeddings_matrix = np.load(matrix_file)
with open('data/movie_ids.txt', 'r') as ids_file:
    id_list = ids_file.read().split(', ')
movie_ids = dict(zip(id_list, range(len(id_list))))

In [119]:
# Number of distinct movie ids read back from disk.
len(movie_ids)

10904

## V2 Taking the average embedding

In [125]:
def load_all_embeddings(pattern='data/embeddings_v1/*/*.txt'):
    """Read every summary embedding file matched by ``pattern``, grouped by movie.

    Each matched file must hold one comma-separated vector of floats. The name
    of the directory that contains the file is used as the movie id.

    Args:
        pattern: glob pattern selecting all embedding files of all movies.

    Returns:
        defaultdict(list) mapping movie id -> list of embeddings (each a list
        of floats), filled in lexicographically sorted path order.

    Raises:
        ValueError: if a file contains a value that cannot be parsed as a float.
    """
    all_embeddings = defaultdict(list)
    # glob returns paths in arbitrary, filesystem-dependent order; sorting keeps
    # the grouping order stable between runs.
    for embedding_path in sorted(glob.glob(pattern)):
        # The parent directory name is the movie id. os.path copes with either
        # path separator and any directory depth, unlike split('/')[2].
        movie_id = os.path.basename(os.path.dirname(embedding_path))
        with open(embedding_path, 'r') as fp:
            embedding = [float(val) for val in fp.read().split(',')]
        all_embeddings[movie_id].append(embedding)
    return all_embeddings


# take all the summary embeddings
embeddings_v2 = load_all_embeddings()

In [112]:
# Number of movies that have at least one summary embedding loaded.
len(embeddings_v2)

10904

In [111]:
# Number of movies in the V1 (first-embedding) dict, for comparison.
len(embeddings_v1)

10904

In [120]:
# Number of ids in the movie id -> row index lookup, for comparison.
len(movie_ids)

10904

In [126]:
# take the average
# embeddings_v2 is a defaultdict(list): indexing it with an unknown id would silently
# insert an empty list, and np.mean of an empty array is NaN. Check coverage up front
# so a mismatch fails loudly instead of producing a corrupt row.
missing_movie_ids = [movie_id for movie_id in movie_ids if not embeddings_v2.get(movie_id)]
if missing_movie_ids:
    raise KeyError(
        'no summary embeddings for {} movie(s), e.g. {!r}'.format(
            len(missing_movie_ids), missing_movie_ids[0]
        )
    )
# One averaged vector per movie; rows follow the iteration order of movie_ids.
embedding_v2_matrix = np.array(
    [np.mean(np.array(embeddings_v2[movie_id]), axis=0) for movie_id in movie_ids]
)

In [127]:
# Persist the averaged (V2) matrix in NumPy's binary .npy format.
np.save('data/embeddings_v2.npy', embedding_v2_matrix)

In [130]:
# Sanity check: display the dimensions of the averaged V2 matrix.
embedding_v2_matrix.shape

(10904, 1536)

## Load and make a prediction

In [172]:
# Load the persisted V1 matrix together with the id list that labels its rows.
with open('data/embeddings_v1.npy', 'rb') as fp:
    loaded_embeddings_matrix = np.load(fp)
# Read the id file once and derive both lookup directions from the same list.
with open('data/movie_ids.txt', 'r') as fp:
    ordered_movie_ids = fp.read().split(', ')
movie_ids = {movie: idx for idx, movie in enumerate(ordered_movie_ids)}  # movie id -> row index
movie_ids_reverse = dict(enumerate(ordered_movie_ids))  # row index -> movie id
# The row lookups below are only valid if the ids are unique and match the matrix rows.
assert len(movie_ids) == len(ordered_movie_ids) == loaded_embeddings_matrix.shape[0], \
    'data/movie_ids.txt and data/embeddings_v1.npy are out of sync'

In [165]:
# Seed movies for the recommendation, and the matrix row index of each one.
selected_movies = [
    'tt0086250_scarface',
    'tt1119646_the_hangover',
    'tt0448157_hancock',
]
selected_movie_ids = []
for title in selected_movies:
    selected_movie_ids.append(movie_ids[title])

In [149]:
# Average the embedding rows of the selected movies into a single user vector.
selected_rows = [loaded_embeddings_matrix[movie_ids[title]] for title in selected_movies]
user_embedding = np.array(selected_rows).mean(axis=0)

In [151]:
# Sanity check: display the dimensions of the averaged user vector.
user_embedding.shape

(1536,)

In [154]:
# Score every movie row against the user vector, reshaped to a single-row 2-D array.
user_row = user_embedding.reshape(1, -1)
similarity_matrix = cosine_similarity(loaded_embeddings_matrix, user_row)

In [166]:
# Rank all rows by similarity to the user vector, best match first.
similarities = sorted(
    ((row, scores[0]) for row, scores in enumerate(similarity_matrix)),
    key=lambda pair: pair[1],
    reverse=True,
)
# Keep the ten best-scoring rows that are not among the seed movies themselves.
similar_movies = [row for row, _ in similarities if row not in selected_movie_ids][:10]

In [174]:
# Translate the recommended row indices back into movie ids for display.
[movie_ids_reverse[row_index] for row_index in similar_movies]

['tt0032209_angels_over_broadway',
 'tt3276924_heist',
 'tt0091875_running_scared',
 'tt0375063_sideways',
 'tt1389096_stand_up_guys',
 'tt0104952_my_cousin_vinny',
 'tt0796375_you_kill_me',
 'tt10365998_infinity_pool',
 'tt0120620_brokedown_palace',
 'tt0104850_memoirs_of_an_invisible_man']