In [None]:
import os
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import sys

# Make modules that live one level above the notebook importable.
# The notebook is assumed to be started from the 'bin/' folder, so the parent
# of the current working directory is used as the extra import root.
notebook_directory = os.getcwd()
parent_directory = os.path.dirname(notebook_directory)
# Guard the append so that re-running this cell does not pile up duplicate entries
if parent_directory not in sys.path:
    sys.path.append(parent_directory)

In [None]:
from imdb import Cinemagoer
import timeit
import json
from pathlib import Path

# Shared Cinemagoer client used for every IMDb lookup in this notebook
cg = Cinemagoer()
# JSON file that stores already-fetched movie data; the path is relative,
# so it resolves against the current working directory
cache_file = Path("movie_synopsis_cache.json")

# Function to load cache data from a file
def load_cache():
    """Read the on-disk movie cache and return it as a dict.

    Returns:
        dict: The parsed contents of ``cache_file``. An empty dict is returned
        when the file is missing, empty, not decodable, not valid JSON, or
        when its top-level JSON value is not an object, so callers can always
        do ``key in cache`` and ``cache[key] = value`` on the result.
    """
    if not cache_file.is_file():
        return {}
    try:
        with open(cache_file, 'r', encoding='utf-8') as file:
            cached = json.load(file)
    except (FileNotFoundError, json.JSONDecodeError, UnicodeDecodeError):
        # The file vanished between the check and the open, or its content is
        # unreadable (an empty file also fails to parse as JSON)
        return {}
    # Valid JSON that is not an object (e.g. [] or null) cannot act as the cache mapping
    return cached if isinstance(cached, dict) else {}

# Function to save cache data to a file
def save_cache(cache):
    """Write ``cache`` to ``cache_file`` as JSON indented by 4 spaces.

    The data is serialized in memory first and written to a sibling
    ``<name>.tmp`` file, which then replaces ``cache_file``. A failure while
    serializing or writing therefore leaves the previous cache file intact
    instead of a truncated one.

    Args:
        cache: JSON-serializable mapping to persist.

    Raises:
        TypeError, ValueError: Propagated from ``json.dumps`` when ``cache``
            cannot be serialized; ``cache_file`` is not touched in that case.
    """
    # Serialize before opening anything for writing
    payload = json.dumps(cache, indent=4)
    temp_file = cache_file.with_name(cache_file.name + ".tmp")
    with open(temp_file, 'w', encoding='utf-8') as file:
        file.write(payload)
    # Swap the finished file into place in a single step
    temp_file.replace(cache_file)

def get_movie_description(imdb_id):
    """Return the stored data for ``imdb_id``, fetching it on a cache miss.

    The cache is re-read through ``load_cache`` on every call. On a miss the
    movie is requested from the ``cg`` client (with "tt" removed from the ID),
    the elapsed time of that request is printed, and the cache is written
    back through ``save_cache`` with the new entry.

    Args:
        imdb_id: IMDb identifier string such as ``'tt3469046'``.

    Returns:
        dict: The cached entry, or a new dict holding ``movie[name]`` for
        every name in ``movie.current_info`` that is also present in ``movie``.
    """
    known_movies = load_cache()

    if imdb_id not in known_movies:
        # Cache miss: time the remote lookup
        started_at = timeit.default_timer()
        movie = cg.get_movie(imdb_id.replace("tt", ""))
        finished_at = timeit.default_timer()
        print("get_movie_description took {} seconds to run".format(finished_at - started_at))

        # Keep only the names listed in current_info that the movie actually carries
        fetched = {name: movie[name] for name in movie.current_info if name in movie}

        # Persist the new entry together with everything already cached
        known_movies[imdb_id] = fetched
        save_cache(known_movies)
        return fetched

    print("Retrieved from cache.")
    return known_movies[imdb_id]

imdb_id = 'tt3469046'  # Example IMDb ID; IDs have the form 'tt' + digits (e.g. 'tt0111161' is "The Shawshank Redemption")
print(get_movie_description(imdb_id))

In [None]:
# Filter movies from movies_metadata.csv
def contains_english(lang_list):
    """Tell whether a ``spoken_languages`` cell matches the English-only literal.

    The check is a containment test for the exact text
    ``[{'iso_639_1': 'en', 'name': 'English'}]`` (brackets included). For
    string input this means a cell that lists English next to other
    languages does not match.

    Args:
        lang_list: Raw cell value. Expected to be a string; values that do not
            support ``in`` (e.g. NaN or None) are tolerated.

    Returns:
        bool: True when the literal is found, False when it is not or when
        ``lang_list`` does not support the containment test.
    """
    try:
        return "[{'iso_639_1': 'en', 'name': 'English'}]" in lang_list
    except TypeError:
        # Raised for non-container values such as float NaN or None.
        # Deliberately narrow so KeyboardInterrupt and real bugs are not swallowed.
        return False

# Reading from the movies_metadata.csv file
# Required fields:
# 1. imdb_id
# 2. release_date
# 3. spoken_languages
# 4. vote_count
def load_english_movies(years="", min_reviews=0, csv_path="../movies_metadata.csv"):
    """Load the movie metadata CSV and keep the rows that pass all filters.

    A row is kept when its ``release_date`` contains one of ``years``, its
    ``spoken_languages`` value satisfies ``contains_english`` and its
    ``vote_count`` is at least ``min_reviews``.

    Args:
        years: A single year or an iterable of years (strings or ints). They
            are joined with ``|`` and matched as a regular expression anywhere
            in ``release_date``. An empty value disables the year filter, but
            rows with a missing ``release_date`` are still dropped.
        min_reviews: Minimum ``vote_count`` (inclusive).
        csv_path: Location of the metadata CSV file.

    Returns:
        pandas.DataFrame: An independent copy of the matching rows, so callers
        can add columns without a SettingWithCopyWarning.
    """
    # A bare string would otherwise be joined character by character ("2015" -> "2|0|1|5")
    if isinstance(years, str):
        years = [years] if years else []
    year_filter = "|".join(str(year) for year in years)

    movies = pd.read_csv(csv_path)
    released_in_years = movies['release_date'].str.contains(year_filter, na=False)
    english_only = movies['spoken_languages'].apply(contains_english)
    enough_votes = movies['vote_count'] >= min_reviews
    return movies[released_in_years & english_only & enough_votes].copy()

# Select the movies whose release_date matches 2015, 2016 or 2017 and whose vote_count is at least 500
filtered_df_extended = load_english_movies(["2015", "2016", "2017"], 500)
# Plain Python list of the IMDb IDs of the selected rows
imdb_movie_ids = filtered_df_extended['imdb_id'].tolist()
print(imdb_movie_ids)

In [None]:
def get_synopsis_for_movie(imdb_movie_id):
    """Return one piece of descriptive text for a movie.

    The ``"synopsis"`` entry of the movie description is preferred and the
    ``"plot"`` entry is the fallback. For a list-like entry the first element
    is used; an entry that is already a plain string is returned whole.

    Args:
        imdb_movie_id: IMDb identifier passed on to ``get_movie_description``.

    Returns:
        The chosen text, ``""`` when neither entry holds any text, or ``None``
        when the lookup raised an error (the error is printed, not re-raised).
    """
    try:
        movie_description = get_movie_description(imdb_movie_id)
        for field in ("synopsis", "plot"):
            texts = movie_description.get(field)
            if texts is None or len(texts) == 0:
                continue
            # Indexing a plain string would yield only its first character
            return texts if isinstance(texts, str) else texts[0]
        return ""
    except Exception as error:
        # Best-effort by design: one failing movie must not abort the whole run.
        # KeyboardInterrupt/SystemExit are not caught, so the run can still be stopped.
        print(f"Movie with id {imdb_movie_id} ran into an error! Skipping...")
        print(f"  reason: {error!r}")
        return None

# Look up one synopsis per movie and store it as a new column on filtered_df_extended.
# get_synopsis_for_movie yields None for a failed lookup and "" when no text was found.
filtered_df_extended['cleaned_synopsis'] = filtered_df_extended['imdb_id'].apply(get_synopsis_for_movie)
# Parallel lists taken from the same frame: position i in both refers to the same row
synopsis_list = filtered_df_extended['cleaned_synopsis'].tolist()
titles_with_synopsis = filtered_df_extended['title'].tolist()
# Spot-check two randomly chosen synopses
print(filtered_df_extended['cleaned_synopsis'].sample(n=2))

In [None]:
from transformers import BertTokenizer, BertModel
import numpy as np

# Initialize tokenizer and model from pre-trained BERT
# Both are loaded under the name 'bert-base-uncased' and kept as module-level
# globals; get_bert_embeddings reads them directly.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to create embeddings for a list of synopses using BERT
def get_bert_embeddings(synopses):
    """Embed each synopsis as the mean of the model's last hidden state.

    Uses the module-level ``tokenizer`` and ``model``. Each synopsis is
    tokenized on its own (truncated to at most 512 tokens) and run through
    the model with gradient tracking disabled.

    Args:
        synopses: Iterable of synopsis texts. ``None`` and NaN entries are
            embedded as the empty string so that the output stays aligned
            with the input, position by position.

    Returns:
        list: One NumPy array per input synopsis, in input order.
    """
    import torch  # local import: only needed for the no_grad context below

    embeddings = []
    # Inference only: without no_grad every forward pass would also build an autograd graph
    with torch.no_grad():
        for synopsis in synopses:
            is_missing = synopsis is None or (isinstance(synopsis, float) and synopsis != synopsis)
            text = "" if is_missing else synopsis
            # Tokenize the synopsis and convert to input format expected by BERT
            inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
            # Get the output from BERT model
            outputs = model(**inputs)
            # Use the mean over dim 1 of the last hidden state as the embedding
            embedding = outputs.last_hidden_state.mean(dim=1).squeeze().detach().cpu().numpy()
            embeddings.append(embedding)
    return embeddings

# Create BERT embeddings for the synopses
bert_embeddings = get_bert_embeddings(synopsis_list)
# Stack the per-synopsis vectors into a single array
# (expected to be 2-D with one row per synopsis when all vectors have the same length)
bert_embeddings_matrix = np.array(bert_embeddings)
print(bert_embeddings_matrix.shape)

In [None]:
# Calculates the cosine similarity between two sets of movie vectors.

# Note: cosine_similarity expects 2D matrices.
# A 1-D vector of length N is therefore reshaped here to the 2D shape (1, N) before the call.
# TODO: Update this function to become a weighted cosine, using weights from a file.
def get_cosine_similarity(movie_vector_1, movie_vector_2):
    """Return the pairwise cosine similarity between two sets of vectors.

    Args:
        movie_vector_1: 2-D matrix with one vector per row, or a single 1-D vector.
        movie_vector_2: 2-D matrix with one vector per row, or a single 1-D vector.

    Returns:
        The result of ``cosine_similarity`` on the two (row-shaped) inputs:
        entry ``[i, j]`` compares row ``i`` of the first input with row ``j``
        of the second.
    """
    import numpy as np

    def _as_rows(vectors):
        # Only 1-D inputs are reshaped; anything that already reports 2 dimensions passes through untouched
        if np.ndim(vectors) == 1:
            return np.asarray(vectors).reshape(1, -1)
        return vectors

    return cosine_similarity(_as_rows(movie_vector_1), _as_rows(movie_vector_2))

# Get the top movies relating to a given movie vector using cosine similarity.
# 2 use cases for this:
# 1. given_movie_vector = a specific movie's embeddings. This will return top movies relating to that movie.
#    Keep skip_first=True so the movie does not recommend itself.
# 2. given_movie_vector = user_profile's vector. This will return top movies recommended for this user.
#    Pass skip_first=False, because the profile is not a row of the matrix and the best hit is a real result.

def get_top_movies_cosine(tfidf_matrix, given_movie_vector, movie_titles, top_n=5, skip_first=True):
    """Rank the rows of a matrix by cosine similarity to one vector and print the best.

    Args:
        tfidf_matrix: Matrix with one movie vector per row.
        given_movie_vector: Query vector, compared against every row through
            ``get_cosine_similarity``.
        movie_titles: Titles indexed by the row positions of ``tfidf_matrix``.
        top_n: Number of movies to return.
        skip_first: When True (default), the single best-scoring row is
            dropped on the assumption that it is the query movie itself.
            Set to False when the query is not a row of the matrix.

    Returns:
        list: ``(title, row_index, score)`` tuples in descending score order.
    """
    # Similarity of the query vector to every row, as a flat array
    cosine_similarities = get_cosine_similarity(given_movie_vector, tfidf_matrix).flatten()

    # argsort is ascending; reverse it to get row indices from most to least similar
    ranked_indices = cosine_similarities.argsort()[::-1]
    first = 1 if skip_first else 0
    similar_indices = ranked_indices[first:first + top_n]

    # Get the scores for the selected movies
    similar_scores = cosine_similarities[similar_indices]

    # Combine titles, indices and scores into a list of tuples
    top_movies = [(movie_titles[index], index, score) for index, score in zip(similar_indices, similar_scores)]

    print("Top similar movies to the provided movie vector:\n")
    for num, (title, index, score) in enumerate(top_movies, start=1):
        print(f"{num}. \"{title}\" at ROW {index} with similarity score: {score}")

    return top_movies

In [None]:
# Print every title next to its list position so that a query row can be picked by hand
for i, title in enumerate(titles_with_synopsis):
    print(i, title)

# Query with row 204 of the embedding matrix (reshaped to (1, N)) and show its 5 closest movies.
# NOTE(review): 204 is a hard-coded position; it refers to a different movie whenever the filtered list changes.
get_top_movies_cosine(bert_embeddings_matrix, bert_embeddings_matrix[204].reshape(1, -1), titles_with_synopsis, 5)
