In [26]:
import glob
import json
from collections import defaultdict
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

## V1 Taking the first embedding only

In [76]:
# Take the first summary embedding only (the `0.txt` file of every movie
# directory) and build a {movie_id: embedding} mapping.
embeddings_v1 = dict()
# glob returns matches in a filesystem-dependent order; sorting makes the row
# order of the matrix built from this dict reproducible between runs.
for embedding_path in sorted(glob.glob('../data/embeddings_v1/*/0.txt')):
    # The movie id is the name of the directory that holds the embedding file.
    # Reading it from the parent directory (instead of a fixed '/' index) keeps
    # working if the base path depth or the OS path separator changes.
    movie_id = Path(embedding_path).parent.name
    with open(embedding_path, 'r') as fp:
        # Each file is a single comma-separated list of floats.
        embeddings_v1[movie_id] = [float(val) for val in fp.read().split(',')]

In [84]:
# Stack the per-movie vectors into one 2-D array; rows follow the insertion
# order of `embeddings_v1`, i.e. the same order as its keys.
embeddings_v1_matrix = np.array(list(embeddings_v1.values()))

In [85]:
# Sanity check: (number of movies in `embeddings_v1`, values per embedding).
embeddings_v1_matrix.shape

(10904, 1536)

In [86]:
# Persist the V1 matrix in NumPy's binary .npy format.
np.save('../data/embeddings_v1.npy', embeddings_v1_matrix)

In [87]:
# Store the movie ids in matrix row order as one ', '-separated line, so that
# position i in this file names row i of the saved embedding matrix.
with open('../data/movie_ids.txt', 'w') as fp:
    fp.write(', '.join(embeddings_v1))

In [88]:
# is this faster?
# Read the saved matrix back and rebuild the {movie_id: row index} lookup from
# the ', '-separated id file written alongside it.
loaded_embeddings_matrix = np.load('../data/embeddings_v1.npy')
with open('../data/movie_ids.txt', 'r') as fp:
    movie_ids = {
        movie: position
        for position, movie in enumerate(fp.read().split(', '))
    }

In [89]:
# Number of distinct movie ids read back from movie_ids.txt.
len(movie_ids.keys())

10904

In [90]:
# Only keep the movies we have embeddings for: every summary.json whose movie
# directory is not a key of `movie_ids` is skipped.
movies = []
for summary_path in glob.glob('../data/movies/*/summary.json'):
    # The movie id is the name of the directory containing summary.json; taking
    # it from the parent directory avoids depending on a fixed '/' index.
    movie_id = Path(summary_path).parent.name
    if movie_id not in movie_ids:
        continue
    # The text before the first underscore is stored as the movie's `id`
    # (e.g. 'tt0086250' for 'tt0086250_scarface').
    imdb_id = movie_id.split('_')[0]
    with open(summary_path, 'r') as fp:
        movie = json.load(fp)
    movie['movie_id'] = movie_id
    movie['id'] = imdb_id
    movies.append(movie)
# Build the frame once from the list of records.
movies_df = pd.DataFrame(movies)

In [91]:
# Sanity check: (movies kept, metadata columns).
movies_df.shape

(10904, 12)

In [92]:
# Load all plot summary texts into memory, grouped by the id prefix of the
# movie directory. Each entry is a (text, length) pair so the shortest summary
# can be picked later.
plot_summaries = defaultdict(list)
for plot_summary_path in glob.glob('../data/movies/*/plot_summaries/*.txt'):
    # Files live in <movie dir>/plot_summaries/<name>.txt, so the movie
    # directory is two levels above the file; the part of its name before the
    # first underscore is used as the key.
    imdb_id = Path(plot_summary_path).parent.parent.name.split('_')[0]
    with open(plot_summary_path, 'r') as fp:
        summary = fp.read()
    plot_summaries[imdb_id].append((summary, len(summary)))

In [93]:
# For every movie keep the text of its shortest plot summary. `min` picks the
# first entry with the smallest stored length, which is the same element a
# stable sort would put first, without sorting the whole list.
shortest_plot_summary = {
    movie: min(summaries, key=lambda x: x[1])[0]
    for movie, summaries in plot_summaries.items()
}

In [94]:
# Attach the shortest plot summary to each row by looking up its `id`; ids
# without an entry in `shortest_plot_summary` get NaN.
movies_df['plot'] = movies_df['id'].map(shortest_plot_summary)

In [95]:
# Derive the release year: drop everything from the first '(' onward in
# `release_date`, then parse what is left as a date. The `.str` accessors
# propagate missing values (yielding NaT and then NaN), whereas indexing inside
# `.apply` raised a TypeError on rows without a release date.
movies_df['release_year'] = pd.to_datetime(
    movies_df['release_date'].str.split('(').str[0].str.strip(),
    format='mixed',
).dt.year

In [96]:
# Write the movie metadata table to CSV without the DataFrame index column.
movies_df.to_csv('../data/movie_summary.csv', index=False)

In [97]:
# Spot-check: show the row(s) whose `id` equals one specific value.
movies_df[movies_df.id == 'tt1245112']

Unnamed: 0,title,genres,director,starring,rating,release_date,runtime,certificate,img_url,plot,movie_id,id,release_year


## V2 Taking the average embedding

In [98]:
# Take all the summary embeddings: collect every .txt embedding found in a
# movie directory into a list per movie id.
# NOTE(review): this reads from the `embeddings_v1` directory even though the
# result is the V2 matrix — presumably that directory holds every per-summary
# embedding; confirm.
embeddings_v2 = defaultdict(list)
# Sorted so the per-movie lists are built in a reproducible order (glob's own
# ordering depends on the filesystem).
for embedding_path in sorted(glob.glob('../data/embeddings_v1/*/*.txt')):
    # The movie id is the name of the directory that holds the embedding file.
    movie_id = Path(embedding_path).parent.name
    with open(embedding_path, 'r') as fp:
        embedding = [float(val) for val in fp.read().split(',')]
    embeddings_v2[movie_id].append(embedding)

In [99]:
# Number of movies that have at least one embedding collected for V2.
len(embeddings_v2.keys())

10904

In [100]:
# Number of movies in the V1 mapping, for comparison with the V2 count.
len(embeddings_v1.keys())

10904

In [101]:
# Number of ids in the row-index lookup, for comparison with the counts above.
len(movie_ids.keys())

10904

In [102]:
# Take the average of all summary embeddings of each movie. Rows are emitted in
# `movie_ids` order so that row i describes the same movie as in the V1 matrix.
# `embeddings_v2` is a defaultdict: indexing it with an unknown id would
# silently insert an empty list, whose mean is NaN. Check membership first and
# fail loudly instead of producing a corrupt matrix.
missing_movie_ids = [movie_id for movie_id in movie_ids if movie_id not in embeddings_v2]
if missing_movie_ids:
    raise KeyError(f'no embeddings found for movie ids: {missing_movie_ids[:5]}')
embedding_v2_matrix = np.array([
    np.mean(np.array(embeddings_v2[movie_id]), axis=0)
    for movie_id in movie_ids
])

In [103]:
# Persist the averaged (V2) matrix in NumPy's binary .npy format.
np.save('../data/embeddings_v2.npy', embedding_v2_matrix)

In [104]:
# Sanity check: (number of movies, values per averaged embedding).
embedding_v2_matrix.shape

(10904, 1536)

## Load and make a prediction

In [172]:
# Load the saved V1 matrix and the id list that names its rows.
# The files are read from '../data/', the same location every earlier cell in
# this notebook writes to, so the notebook runs top to bottom from a single
# working directory.
with open('../data/embeddings_v1.npy', 'rb') as fp:
    loaded_embeddings_matrix = np.load(fp)
# Read the id file once and derive both lookups from the same list.
with open('../data/movie_ids.txt', 'r') as fp:
    movie_id_list = fp.read().split(', ')
# movie id -> matrix row index
movie_ids = {movie: idx for idx, movie in enumerate(movie_id_list)}
# matrix row index -> movie id
movie_ids_reverse = dict(enumerate(movie_id_list))

In [165]:
# Movies the user picked, and the matrix row index of each one.
selected_movies = [
    'tt0086250_scarface',
    'tt1119646_the_hangover',
    'tt0448157_hancock',
]
selected_movie_ids = list(map(movie_ids.__getitem__, selected_movies))

In [149]:
# Represent the user as the element-wise mean of the embeddings of the
# selected movies.
user_embedding = np.mean(
    np.array([loaded_embeddings_matrix[movie_ids[movie]] for movie in selected_movies]),
    axis=0,
)

In [151]:
# Sanity check: the user vector is 1-D with one value per embedding dimension.
user_embedding.shape

(1536,)

In [154]:
# Cosine similarity of every movie row against the user vector; the 1-D user
# vector is turned into a single-row 2-D array, so the result has one column.
user_row = user_embedding.reshape(1, -1)
similarity_matrix = cosine_similarity(loaded_embeddings_matrix, user_row)

In [166]:
# Rank every matrix row by its similarity to the user vector, best first.
similarities = sorted(
    ((row, scores[0]) for row, scores in enumerate(similarity_matrix)),
    key=lambda pair: pair[1],
    reverse=True,
)
# Keep the ten best-ranked rows, leaving out the movies the user selected.
similar_movies = [
    row for row, _ in similarities if row not in selected_movie_ids
][:10]

In [174]:
# Translate the recommended row indices back into movie ids.
[movie_ids_reverse[row] for row in similar_movies]

['tt0032209_angels_over_broadway',
 'tt3276924_heist',
 'tt0091875_running_scared',
 'tt0375063_sideways',
 'tt1389096_stand_up_guys',
 'tt0104952_my_cousin_vinny',
 'tt0796375_you_kill_me',
 'tt10365998_infinity_pool',
 'tt0120620_brokedown_palace',
 'tt0104850_memoirs_of_an_invisible_man']