In [None]:
import os
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import sys

# Get the current working directory of the Jupyter notebook
notebook_directory = os.getcwd()
# Assuming the notebook is in the 'bin/' folder, add the parent directory to sys.path
# so that modules located in that parent directory become importable.
parent_directory = os.path.dirname(notebook_directory)
# Guard against duplicates: re-running this cell must not keep growing sys.path.
if parent_directory not in sys.path:
    sys.path.append(parent_directory)

In [None]:
import timeit
import json
from pathlib import Path

# JSON file (relative to the working directory) that persists fetched movie details
cache_file = Path("movie_synopsis_cache.json")

# Function to load cache data from a file
def load_cache():
    """Load the movie-details cache stored in ``cache_file``.

    Returns:
        dict: The decoded JSON object. An empty dict is returned when the
        file is missing, is empty, does not contain valid JSON, or contains
        valid JSON that is not an object (callers index the cache by key, so
        anything other than a dict would be unusable).
    """
    if not (cache_file.is_file() and cache_file.stat().st_size > 0):
        return {}
    with open(cache_file, 'r', encoding='utf-8') as file:
        try:
            cache = json.load(file)
        except json.JSONDecodeError:
            # A corrupt cache is treated like an empty one rather than a fatal error.
            return {}
    return cache if isinstance(cache, dict) else {}

# Function to save cache data to a file
def save_cache(cache):
    """Persist ``cache`` to ``cache_file`` as JSON indented by 4 spaces.

    The data is serialised in memory first and written to a sibling temporary
    file that is then renamed over ``cache_file``. Opening the real file in
    ``'w'`` mode would truncate it before serialisation starts, so a
    non-serialisable value or an interrupted kernel would wipe the existing
    cache.

    Args:
        cache: JSON-serialisable object to store (the dict returned by
            load_cache(), possibly with new entries).

    Raises:
        Any error raised by ``json.dumps`` propagates before the existing
        cache file is touched.
    """
    payload = json.dumps(cache, indent=4)
    temp_file = cache_file.with_name(cache_file.name + ".tmp")
    with open(temp_file, 'w', encoding='utf-8') as file:
        file.write(payload)
    # Rename only after the full payload is on disk.
    temp_file.replace(cache_file)

In [None]:
from imdb import Cinemagoer
# Create one shared instance of the Cinemagoer class.
# get_movie_details() below calls cg.get_movie() on it for every lookup that misses the cache.
cg = Cinemagoer()

# Fetch the details of a single movie, identified by its IMDb id
def get_movie_details(imdb_id):
    """Return a dict of details for ``imdb_id``, using the JSON cache when possible.

    Args:
        imdb_id: IMDb id string; every "tt" substring is removed before the id
            is handed to Cinemagoer.

    Returns:
        dict: On a cache hit, the cached entry as loaded from disk. Otherwise a
        freshly built dict holding the requested fields (``None`` when the
        movie lacks a field; "cast" and "director" are lists of person ids,
        with "cast" limited to the first five entries) plus "synopsis" and
        "plot", each falling back to the other and finally to "".

    Side effects:
        Prints a cache-hit or timing message, and on a cache miss rewrites the
        cache file with the new entry added.
    """
    cache = load_cache()

    # Cache hit: nothing to fetch
    if imdb_id in cache:
        print("Retrieved from cache.")
        return cache[imdb_id]

    # Cache miss: time the Cinemagoer lookup
    fetch_started = timeit.default_timer()
    movie = cg.get_movie(imdb_id.replace("tt", ""))
    fetch_finished = timeit.default_timer()
    print("get_movie_details took {} seconds to run".format(fetch_finished - fetch_started))

    def extract(field):
        # Missing fields become None; people are reduced to their person ids
        if field not in movie:
            return None
        if field == "cast":
            return [person.personID for person in movie[field][:5]]
        if field == "director":
            return [person.personID for person in movie[field]]
        return movie.get(field, None)

    wanted_fields = ["title", "genres", "runtimes", "original air date", "rating", "votes", "imdbID", "language codes", "year", "director", "cast"]
    details = {field: extract(field) for field in wanted_fields}

    # Take the first synopsis / plot entry; each one stands in for the other when absent
    has_synopsis = "synopsis" in movie and len(movie["synopsis"]) > 0
    has_plot = "plot" in movie and len(movie["plot"]) > 0
    first_synopsis = movie["synopsis"][0] if has_synopsis else None
    first_plot = movie["plot"][0] if has_plot else None
    if has_synopsis or has_plot:
        details["synopsis"] = first_synopsis if has_synopsis else first_plot
        details["plot"] = first_plot if has_plot else first_synopsis
    else:
        details["synopsis"] = ""
        details["plot"] = ""

    # Remember the result for next time
    cache[imdb_id] = details
    save_cache(cache)
    return details

from tenacity import retry, stop_after_attempt, wait_fixed
@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))  # Up to 3 attempts in total, with a 1-second wait between attempts
def get_movie_details_with_retry(movie):
    """Call get_movie_details(), retrying through tenacity when it raises.

    Args:
        movie: IMDb id, passed unchanged to get_movie_details().

    Returns:
        Whatever get_movie_details() returns for that id.

    NOTE(review): what is raised once all attempts have failed is decided by
    tenacity's defaults, not by this code - confirm that callers are happy
    with that error type.
    """
    return get_movie_details(movie)

def get_movie_details_as_data_frame(movie_list):
    """Fetch details for every id in ``movie_list`` and return them as a DataFrame.

    Args:
        movie_list: Sequence of IMDb ids.

    Returns:
        pandas.DataFrame: ``pd.json_normalize`` applied to the detail dicts,
        one row per entry of ``movie_list`` and in the same order.
    """
    # Keyed by id, so a repeated id ends up with the result of its latest lookup
    details_by_id = {
        movie_id: get_movie_details_with_retry(movie_id) for movie_id in movie_list
    }
    ordered_details = []
    for movie_id in movie_list:
        if movie_id in details_by_id:
            ordered_details.append(details_by_id[movie_id])
    return pd.json_normalize(ordered_details)

# Smoke test: look up two IMDb ids and print the resulting DataFrame.
# Each id is served from the JSON cache if present, otherwise fetched through Cinemagoer.
print(get_movie_details_as_data_frame(["tt6166392", "tt4046784"]))

In [None]:
import requests
from datetime import datetime, timedelta
import time

# Given a date range, fetch all the movies that were released during that period.
# Additional filters (original language, minimum vote count) are configured here.
# Minimum TMDB vote count a movie needs in order to be kept
MINIMUM_VOTE_COUNT = 50
# Language codes, joined with "," into the with_original_language query parameter
LANGUAGES = ["en"]

def get_tmdb_movies_in_range(start, end):
    """Return TMDB discover results for movies released between ``start`` and ``end``.

    Every page is requested with the same filters, including the minimum vote
    count. Previously only the first request carried ``vote_count.gte``, so the
    page count described a filtered result set while the pages actually
    downloaded were unfiltered. The first page is also reused instead of being
    downloaded a second time.

    Args:
        start: Lower bound for ``primary_release_date.gte`` (e.g. "2023-01-01").
        end: Upper bound for ``primary_release_date.lte``.

    Returns:
        list: The concatenated ``results`` entries of all pages that could be
        fetched. A page that fails is reported with ``print`` and skipped.

    Raises:
        KeyError: if the first response has no "total_pages"/"total_results".
        Errors raised by ``requests`` for the first request also propagate.
    """
    # FIXME(security): an API key must not live in the notebook. Set TMDB_API_KEY
    # in the environment; the literal is kept only as a backward-compatible
    # fallback and should be rotated and removed.
    api_key = os.environ.get("TMDB_API_KEY", '0b2cc6b5655e6c00206bd71118d1156f')
    discover_url = "https://api.themoviedb.org/3/discover/movie"

    def fetch_page(page_number):
        # One set of filters shared by every page keeps the paging consistent.
        params = {
            "api_key": api_key,
            "primary_release_date.gte": start,
            "primary_release_date.lte": end,
            "include_adult": "false",
            "include_video": "false",
            "with_original_language": ",".join(LANGUAGES),
            "page": page_number,
            "sort_by": "popularity.desc",
            "vote_count.gte": MINIMUM_VOTE_COUNT,
        }
        # A timeout stops a stalled connection from hanging the notebook forever.
        response = requests.get(discover_url, params=params, timeout=30)
        return response.json()

    data = fetch_page(1)
    total_pages = data["total_pages"]
    total_results = data["total_results"]
    print(f"total_results: {total_results}")

    # Page 1 is already in hand; only pages 2..total_pages remain.
    movies_in_date_range = list(data.get("results", []))
    for page in range(2, total_pages + 1):
        try:
            movies_in_date_range.extend(fetch_page(page)["results"])
        except Exception as e:
            # Best effort: report the failed page and carry on with the next one.
            print(e)
            time.sleep(.1)
    print(f"total_results extracted: {len(movies_in_date_range)}")
    return movies_in_date_range

def get_imdb_ids_for_tmdb_movies_in_range(start, end):
    """Return the IMDb ids of TMDB movies released between ``start`` and ``end``.

    Each movie returned by get_tmdb_movies_in_range() is looked up through the
    TMDB ``external_ids`` endpoint (one HTTP request per movie). A movie is
    kept only if it has a non-empty IMDb id and at least MINIMUM_VOTE_COUNT
    votes. An empty-string id is now treated as missing; previously only
    ``None`` was, so "" could be returned as if it were a real id.

    Args:
        start: Start of the release-date range, passed to get_tmdb_movies_in_range().
        end: End of the release-date range, passed to get_tmdb_movies_in_range().

    Returns:
        list: IMDb id strings of the kept movies, in discovery order.

    Side effects:
        Prints the number of ids plus the missing, low-vote and found movies
        as ``(tmdb_id, original_title, vote_count)`` tuples. A movie whose
        lookup fails is reported with ``print`` and skipped after a 1-second
        pause.
    """
    # FIXME(security): an API key must not live in the notebook. Set TMDB_API_KEY
    # in the environment; the literal is kept only as a backward-compatible
    # fallback and should be rotated and removed.
    api_key = os.environ.get("TMDB_API_KEY", '0b2cc6b5655e6c00206bd71118d1156f')

    movies = get_tmdb_movies_in_range(start, end)
    imdb_ids = []
    found_movies = []
    low_votes_movies = []
    missing_movies = []
    for movie in movies:
        try:
            tmdb_id = movie["id"]  # named tmdb_id so the builtin id() is not shadowed
            url = f"https://api.themoviedb.org/3/movie/{tmdb_id}/external_ids"
            # A timeout stops one stalled lookup from hanging the whole loop.
            response = requests.get(url, params={"api_key": api_key}, timeout=30)
            imdb_id = response.json()["imdb_id"]
            summary = (tmdb_id, movie["original_title"], movie["vote_count"])
            if not imdb_id:
                # None and "" both mean that no IMDb id is available.
                missing_movies.append(summary)
            elif int(movie["vote_count"]) >= MINIMUM_VOTE_COUNT:
                imdb_ids.append(imdb_id)
                found_movies.append(summary)
            else:
                low_votes_movies.append(summary)
        except Exception as e:
            # Best effort: report the failure, back off briefly, continue.
            print(e)
            time.sleep(1)
    print(f"Number of imdb ids extracted: {len(imdb_ids)}")
    print(f"Missing movies: {missing_movies}")
    print(f"Low votes movies: {low_votes_movies}")
    print(f"Found movies: {found_movies}")
    return imdb_ids

# 'yesterday' is computed but not referenced elsewhere in the visible notebook;
# the range actually queried is the fixed start/end pair below.
yesterday = (datetime.now() - timedelta(1)).strftime('%Y-%m-%d') # Eventually, we will use this in the cron job that runs to populate for the last 'n' days
start = "2023-01-01"
end = "2023-12-31"
# Discover the TMDB movies released in [start, end] and map them to IMDb ids
# (this performs one external_ids request per discovered movie).
imdb_movie_ids = get_imdb_ids_for_tmdb_movies_in_range(start, end)

print(imdb_movie_ids)

In [None]:
# For each movie discovered, fetch full details using Cinemagoer
movie_details_df = get_movie_details_as_data_frame(imdb_movie_ids)
# Parallel lists: position i in both lists refers to the same DataFrame row.
# Despite the name, no filtering happens here - get_movie_details() stores ""
# for a movie that has neither synopsis nor plot, and that row is still included.
titles_with_synopsis = movie_details_df['title'].tolist()
synopsis_list = movie_details_df['synopsis'].tolist()

In [None]:
from transformers import BertTokenizer, BertModel
import numpy as np

# Initialize tokenizer and model from pre-trained BERT ('bert-base-uncased').
# Both are module-level objects that get_bert_embeddings() reads as globals.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to create embeddings for a list of synopses using BERT
def get_bert_embeddings(synopses):
    """Embed each synopsis as the mean of BERT's last hidden state.

    Args:
        synopses: Iterable of synopsis strings. Each one is tokenized on its
            own and truncated to at most 512 tokens.

    Returns:
        list: One NumPy array per synopsis, in input order.
    """
    # Local import: the notebook does not import torch at the top level.
    import torch

    embeddings = []
    # Inference only: with autograd disabled no graph is recorded for the
    # forward passes, which saves memory without changing the outputs.
    with torch.no_grad():
        for synopsis in synopses:
            # Tokenize the synopsis and convert to input format expected by BERT
            inputs = tokenizer(synopsis, return_tensors='pt', padding=True, truncation=True, max_length=512)
            # Get the output from BERT model
            outputs = model(**inputs)
            # Use the mean of the last hidden state (averaged over dim 1) as the embedding
            embedding = outputs.last_hidden_state.mean(dim=1).squeeze().detach().numpy()
            embeddings.append(embedding)
    return embeddings

# Create BERT embeddings for the synopses
bert_embeddings = get_bert_embeddings(synopsis_list)
# Stack the per-synopsis vectors into a single array, one entry per synopsis and
# in the same order as synopsis_list / titles_with_synopsis.
bert_embeddings_matrix = np.array(bert_embeddings)
# Sanity check - expected to be (number of synopses, embedding size)
print(bert_embeddings_matrix.shape)

In [None]:
# Calculates the cosine similarity between two sets of vectors (movies).
#
# Note: cosine_similarity expects 2D matrices.
# To compare a single vector, first reshape it to the 2D shape (1, N), where N is the vector length.
# TODO: turn this into a weighted cosine, using weights read from a file.
def get_cosine_similarity(movie_vector_1, movie_vector_2):
    """Return ``cosine_similarity(movie_vector_1, movie_vector_2)`` unchanged.

    Args:
        movie_vector_1: 2D array-like; a lone vector must be reshaped to (1, N).
        movie_vector_2: 2D array-like with the same number of columns.

    Returns:
        The similarity matrix produced by scikit-learn's ``cosine_similarity``.
    """
    return cosine_similarity(movie_vector_1, movie_vector_2)

# Get the top movies relating to a given movie vector using cosine similarity.
# 2 use cases for this:
# 1. given_movie_vector = a specific movie's embeddings. This will return top movies relating to that movie.
#    Keep skip_first=True so the movie does not recommend itself.
# 2. given_movie_vector = user_profile's vector. This will return top movies recommended for this user.
#    Pass skip_first=False: the profile is not a row of the matrix, so the best match must be kept.

def get_top_movies_cosine(tfidf_matrix, given_movie_vector, movie_titles, top_n=5, skip_first=True):
    """Rank the rows of ``tfidf_matrix`` by cosine similarity to ``given_movie_vector``.

    Args:
        tfidf_matrix: 2D matrix with one row per movie (any embedding works,
            not only TF-IDF).
        given_movie_vector: Query vector, already reshaped to 2D ``(1, N)``.
        movie_titles: Titles indexed like the rows of ``tfidf_matrix``.
        top_n: Number of movies to return.
        skip_first: When True (the default, and the previous behaviour) the
            highest-ranked row is dropped, on the assumption that it is the
            query movie itself. Set it to False when the query vector is not
            one of the matrix rows.

    Returns:
        list: Up to ``top_n`` tuples ``(title, row_index, score)``, ordered by
        descending similarity. The same ranking is also printed.
    """
    # Similarity of the query vector to every row, flattened to 1D
    cosine_similarities = get_cosine_similarity(given_movie_vector, tfidf_matrix).flatten()

    # argsort is ascending, so reverse it to get the best matches first
    ranked_indices = cosine_similarities.argsort()[::-1]

    # Optionally drop the top hit (the query movie itself in use case 1)
    offset = 1 if skip_first else 0
    similar_indices = ranked_indices[offset:top_n + offset]

    # Get the scores for the selected movies
    similar_scores = cosine_similarities[similar_indices]

    # Combine titles, indices and scores into a list of tuples
    top_movies = [(movie_titles[index], index, score) for index, score in zip(similar_indices, similar_scores)]

    print("Top similar movies to the provided movie vector:\n")
    for num, (title, index, score) in enumerate(top_movies, start = 1):
        print(f"{num}. \"{title}\" at ROW {index} with similarity score: {score}")

    return top_movies

In [None]:
# List every title with its row number so a query row can be picked by eye
for i, title in enumerate(titles_with_synopsis):
    print(i, title)

# Row of the movie to find neighbours for; change this to query another title
QUERY_MOVIE_INDEX = 233
if QUERY_MOVIE_INDEX >= len(titles_with_synopsis):
    # Same exception type as the bare index would raise, but with an actionable message
    raise IndexError(
        f"QUERY_MOVIE_INDEX={QUERY_MOVIE_INDEX} is out of range: only "
        f"{len(titles_with_synopsis)} movies were loaded"
    )

get_top_movies_cosine(bert_embeddings_matrix, bert_embeddings_matrix[QUERY_MOVIE_INDEX].reshape(1, -1), titles_with_synopsis, 5)
