In [6]:
import os
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler, StandardScaler, normalize, RobustScaler
from scipy import sparse
import sys

# Get the current working directory of the Jupyter notebook
notebook_directory = os.getcwd()
# Assuming the notebook is in the 'bin/' folder, add the parent directory to sys.path
parent_directory = os.path.dirname(notebook_directory)
sys.path.append(parent_directory)

In [7]:
import timeit
import json
from pathlib import Path

cache_file = Path("movie_synopsis_cache.json")

# Function to load cache data from a file
def load_cache():
    if cache_file.is_file() and cache_file.stat().st_size > 0:
        with open(cache_file, 'r') as file:
            try:
                return json.load(file)
            except json.JSONDecodeError:
                return {}
    return {}

# Function to save cache data to a file
def save_cache(cache):
    with open(cache_file, 'w') as file:
        json.dump(cache, file, indent=4)

In [8]:
from imdb import Cinemagoer
# Create an instance of the Cinemagoer class
cg = Cinemagoer()

# Create methods to fetch movie details given a list of imdb movie ids
def get_movie_details(imdb_id):
    cache = load_cache()

    # Check if the movie data is in cache
    if imdb_id in cache:
        print("Retrieved from cache.")
        return cache[imdb_id]

    # If not in cache, get movie data
    start = timeit.default_timer()
    cg_imdb_id = imdb_id.replace("tt", "")
    movie = cg.get_movie(cg_imdb_id)
    end = timeit.default_timer()
    print("get_movie_details took {} seconds to run".format(end - start))
    result = {}

    keys = ["title", "genres", "runtimes", "original air date", "rating", "votes", "imdbID", "language codes", "year", "director", "cast"]
    for key in keys:
        if key not in movie:
            result[key] = None
        elif key == "cast":
            result[key] = [c.personID for c in movie[key][:5]]
        elif key == "director":
            result[key] = [c.personID for c in movie[key]]
        else:
            result[key] = movie.get(key, None)

    synopsis_present = True if "synopsis" in movie and len(movie["synopsis"]) > 0 else False
    plot_present = True if "plot" in movie and len(movie["plot"]) > 0 else False
    if synopsis_present and plot_present:
        result["synopsis"] = movie["synopsis"][0]
        result["plot"] = movie["plot"][0]
    elif synopsis_present:
        result["synopsis"] = movie["synopsis"][0]
        result["plot"] = movie["synopsis"][0]
    elif plot_present:
        result["synopsis"] = movie["plot"][0]
        result["plot"] = movie["plot"][0]
    else:
        result["synopsis"] = ""
        result["plot"] = ""

    # Save the new data to cache
    print("trying to save ", imdb_id)
    cache[imdb_id] = result
    save_cache(cache)
    return result

from tenacity import retry, stop_after_attempt, wait_fixed
@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))  # Retries up to 3 times with a 1-second wait between tries
def get_movie_details_with_retry(movie):
    return get_movie_details(movie)

def get_movie_details_as_data_frame(movie_list):
    all_movie_details = {}
    for movie in movie_list:
        all_movie_details[movie] = get_movie_details_with_retry(movie)
    all_movie_details = [all_movie_details[movie] for movie in movie_list if movie in all_movie_details]
    return pd.json_normalize(all_movie_details)

print(get_movie_details_as_data_frame(["tt6166392", "tt4046784"]))

Retrieved from cache.
Retrieved from cache.
                            title  \
0                           Wonka   
1  Maze Runner: The Scorch Trials   

                                          genres runtimes  original air date  \
0  [Adventure, Comedy, Family, Fantasy, Musical]    [116]  15 Dec 2023 (USA)   
1          [Action, Adventure, Sci-Fi, Thriller]    [131]  18 Sep 2015 (USA)   

   rating   votes   imdbID language codes  year   director  \
0     7.2   80003  6166392           [en]  2023  [1653753]   
1     6.3  271655  4046784           [en]  2015  [1226871]   

                                              cast  \
0  [3154303, 15067637, 0564133, 5728367, 13588198]   
1    [3729721, 3859624, 2546012, 1032473, 2766708]   

                                            synopsis  \
0  A ship sails towards land and a young Willy Wo...   
1  The opening scene shows hundreds of people cro...   

                                                plot  
0  With dreams of opening a s

In [9]:
import requests
from datetime import datetime, timedelta
import time

# Given a date range, fetch all the movies that were released during that period.
# Additional filters like language/minimum vote count can also be specified
MINIMUM_VOTE_COUNT = 200
LANGUAGES = ["en"]

def get_tmdb_movies_in_range(start, end):
    api_key = '0b2cc6b5655e6c00206bd71118d1156f'
    languages = ",".join(LANGUAGES)
    url = f'https://api.themoviedb.org/3/discover/movie?api_key={api_key}&primary_release_date.gte={start}&primary_release_date.lte={end}&include_adult=false&include_video=false&with_original_language={languages}&page=1&sort_by=popularity.desc&vote_count.gte={MINIMUM_VOTE_COUNT}'
    response = requests.get(url)
    data = response.json()
    total_pages = data["total_pages"]
    total_results = data["total_results"]
    movies_in_date_range = []
    print(f"total_results: {total_results}")

    for page in range(total_pages):
        try:
            url = f'https://api.themoviedb.org/3/discover/movie?api_key={api_key}&primary_release_date.gte={start}&primary_release_date.lte={end}&include_adult=false&include_video=false&with_original_language={languages}&page={page+1}&sort_by=popularity.desc'
            response = requests.get(url)
            data = response.json()
            movies_in_date_range.extend(data["results"])
        except Exception as e:
            print(e)
            time.sleep(.1)
    print(f"total_results extracted: {len(movies_in_date_range)}")
    return movies_in_date_range

def get_imdb_ids_for_tmdb_movies_in_range(start, end):
    api_key = '0b2cc6b5655e6c00206bd71118d1156f'

    movies = get_tmdb_movies_in_range(start, end)
    imdb_ids = []
    found_movies = []
    low_votes_movies = []
    missing_movies = []
    for movie in movies:
        try:
            id = movie["id"]
            url = f"https://api.themoviedb.org/3/movie/{id}/external_ids?api_key={api_key}"
            response = requests.get(url)
            data = response.json()
            imdb_id = data["imdb_id"]
            if imdb_id is not None:
                if int(movie["vote_count"]) >= MINIMUM_VOTE_COUNT:
                    imdb_ids.append(imdb_id)
                    found_movies.append((id, movie["original_title"], movie["vote_count"]))
                else:
                    low_votes_movies.append((id, movie["original_title"], movie["vote_count"]))
            else:
                missing_movies.append((id, movie["original_title"], movie["vote_count"]))
        except Exception as e:
            print(e)
            time.sleep(1)
    print(f"Number of imdb ids extracted: {len(imdb_ids)}")
    print(f"Missing movies: {missing_movies}")
    print(f"Low votes movies: {low_votes_movies}")
    print(f"Found movies: {found_movies}")
    return imdb_ids

yesterday = (datetime.now() - timedelta(1)).strftime('%Y-%m-%d') # Eventually, we will use this in the cron job that runs to populate for the last 'n' days
start = "2023-01-01"
end = "2023-12-31"
imdb_movie_ids = get_imdb_ids_for_tmdb_movies_in_range(start, end)

print(imdb_movie_ids)

total_results: 183
total_results extracted: 200
Number of imdb ids extracted: 124
Missing movies: [(1205781, 'Dora: Say Hola to Adventure!', 11), (1082314, 'Insta Gay', 2)]
Low votes movies: [(1049948, 'Vikings: Battle of Heirs', 10), (1081620, 'The Weapon', 21), (984249, 'Ruthless', 36), (1211483, 'Skal - Fight for Survival', 55), (1161663, 'Ghost Project', 24), (1079394, 'The Five', 28), (982940, 'Due Justice', 49), (880100, 'Fear', 41), (1146148, 'Sorry, Charlie', 5), (899445, 'Deep Fear', 147), (927107, 'The Bricklayer', 135), (1034411, 'The Movie Star and the Cowboy', 10), (1189798, 'Squealer', 35), (939335, 'Muzzle', 186), (676727, 'The Inventor', 12), (1060090, 'The Collective', 94), (1101582, 'Good Burger 2', 65), (1061855, 'Richard the Stork and the Mystery of the Great Jewel', 8), (1047925, 'Come Out Fighting', 35), (935906, 'Scrapper', 53), (1035982, 'Hell House LLC Origins: The Carmichael Manor', 76), (981347, 'The Loch Ness Horror', 3), (1027073, 'In the Land of Saints and

In [10]:
# For each movie discovered, fetch full details using Cinemagoer
movie_details_df = get_movie_details_as_data_frame(imdb_movie_ids)
titles_with_synopsis = movie_details_df['title'].tolist()
synopsis_list = movie_details_df['synopsis'].tolist()

Retrieved from cache.
Retrieved from cache.
Retrieved from cache.
Retrieved from cache.
Retrieved from cache.
Retrieved from cache.
Retrieved from cache.
Retrieved from cache.
Retrieved from cache.
Retrieved from cache.
Retrieved from cache.
Retrieved from cache.
Retrieved from cache.
Retrieved from cache.
Retrieved from cache.
Retrieved from cache.
Retrieved from cache.
Retrieved from cache.
Retrieved from cache.
Retrieved from cache.
Retrieved from cache.
Retrieved from cache.
Retrieved from cache.
Retrieved from cache.
Retrieved from cache.
Retrieved from cache.
Retrieved from cache.
Retrieved from cache.
Retrieved from cache.
Retrieved from cache.
Retrieved from cache.
Retrieved from cache.
Retrieved from cache.
Retrieved from cache.
Retrieved from cache.
Retrieved from cache.
Retrieved from cache.
Retrieved from cache.
Retrieved from cache.
Retrieved from cache.
Retrieved from cache.
Retrieved from cache.
Retrieved from cache.
Retrieved from cache.
Retrieved from cache.
Retrieved 

In [11]:
from transformers import BertTokenizer, BertModel
import numpy as np

# Initialize tokenizer and model from pre-trained BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to create embeddings for a list of synopses using BERT
def get_bert_embeddings(synopses):
    embeddings = []
    num_processed = 0
    for synopsis in synopses:
        # Tokenize the synopsis and convert to input format expected by BERT
        inputs = tokenizer(synopsis, return_tensors='pt', padding=True, truncation=True, max_length=512)
        # Get the output from BERT model
        outputs = model(**inputs)
        # Use the mean of the last hidden state as the embedding
        embedding = outputs.last_hidden_state.mean(dim=1).squeeze().detach().numpy()
        embeddings.append(embedding)
        
        num_processed = num_processed + 1
        print(f'{len(synopses) - num_processed} remaining...')
    return embeddings

# Create BERT embeddings for the synopses
bert_embeddings = get_bert_embeddings(synopsis_list)
bert_embeddings_matrix = np.array(bert_embeddings)
print(bert_embeddings_matrix.shape)

123 remaining...
122 remaining...
121 remaining...
120 remaining...
119 remaining...
118 remaining...
117 remaining...
116 remaining...
115 remaining...
114 remaining...
113 remaining...
112 remaining...
111 remaining...
110 remaining...
109 remaining...
108 remaining...
107 remaining...
106 remaining...
105 remaining...
104 remaining...
103 remaining...
102 remaining...
101 remaining...
100 remaining...
99 remaining...
98 remaining...
97 remaining...
96 remaining...
95 remaining...
94 remaining...
93 remaining...
92 remaining...
91 remaining...
90 remaining...
89 remaining...
88 remaining...
87 remaining...
86 remaining...
85 remaining...
84 remaining...
83 remaining...
82 remaining...
81 remaining...
80 remaining...
79 remaining...
78 remaining...
77 remaining...
76 remaining...
75 remaining...
74 remaining...
73 remaining...
72 remaining...
71 remaining...
70 remaining...
69 remaining...
68 remaining...
67 remaining...
66 remaining...
65 remaining...
64 remaining...
63 remaining...


In [12]:
def get_unique_values_for_movie_property(movie_property):
    cast_lists = list(filter(lambda item: item is not None, movie_details_df[movie_property]))
    return {property for sublist in cast_lists for property in sublist}

def get_OHE_columns_for_property(property):
    unique_properties = get_unique_values_for_movie_property(property)
    columns = [f"{property_value}_{property}_OHE" for property_value in unique_properties] #OH = one-hot encoding
    return columns

#Build an empty df of all imdb_movie_id, bert encodings and additional properties of the movie.
def create_empty_movies_vector_df(bert_embeddings_matrix):
    
    movie_count, bert_dimensions = bert_embeddings_matrix.shape

    genre_columns = get_OHE_columns_for_property("genres")
    cast_columns = get_OHE_columns_for_property("cast")
    director_columns = get_OHE_columns_for_property("director")
    bert_columns = [f'embed_{i}_OHE' for i in range(bert_dimensions)]
    additional_columns = ['year_norm', 'runtimes_norm', 'rating_norm', 'votes_norm'] #these are already numerical values. They will be normalized to 0-1 range. 

    all_column_titles = ['imdb_movie_id', 'movie_title'] + bert_columns + cast_columns + director_columns + genre_columns + additional_columns
    #df dimensions = movie_count x (768 bert_dimensions + unique_genre_count + unique_cast_count + unique_director_count + 1 for year + 1 for runtime + 1 rating + 1 for votes)
    mega_df = pd.DataFrame(0, index = range(movie_count), columns = all_column_titles)
    mega_df["imdb_movie_id"] = movie_details_df['imdbID']
    mega_df["movie_title"] = movie_details_df['title']
    return mega_df
  
mega_df = create_empty_movies_vector_df(bert_embeddings_matrix)

In [13]:
def set_OHE_for_movie_property(imdb_id, property, movie_details_df, mega_df):
    movie_row = movie_details_df[movie_details_df['imdbID'] == imdb_id].iloc[0]
    property_values = movie_row[property]
    if property_values is not None:
        for property_value in property_values:
            property_col_name = f"{property_value}_{property}_OHE"
            condition = mega_df['imdb_movie_id'] == imdb_id
            mega_df.loc[condition, property_col_name] = 1

def build_movies_vector_df(mega_df, bert_embeddings_matrix):

    movie_count, bert_dimensions = bert_embeddings_matrix.shape
    assert movie_count == len(mega_df), "Row counts do not match."
    assert bert_dimensions == 768, "Embedding size is expected to be 768."

    for movie in movie_details_df.itertuples():
        imdb_id = movie.imdbID
        movie_index = movie.Index
        mega_df.iloc[movie_index, 2:2+bert_dimensions] = bert_embeddings_matrix[movie_index]

        set_OHE_for_movie_property(imdb_id, "genres", movie_details_df, mega_df)
        set_OHE_for_movie_property(imdb_id, "cast", movie_details_df, mega_df)
        set_OHE_for_movie_property(imdb_id, "director", movie_details_df, mega_df)

        condition = mega_df['imdb_movie_id'] == imdb_id
        mega_df.loc[condition, "year_norm"] = movie.year
        mega_df.loc[condition, "runtimes_norm"] = float(movie.runtimes[0]) if movie.runtimes is not None and len(movie.runtimes) > 0 else 0
        mega_df.loc[condition, "rating_norm"] = movie.rating
        mega_df.loc[condition, "votes_norm"] = movie.votes

    #Apply Selective Normalization (min-max scaling for year, and standardization
    minMaxScaler = MinMaxScaler()
    robustScaler = RobustScaler()

    for col in ['year_norm', 'runtimes_norm', 'rating_norm', 'votes_norm']:
        # Apply RobustScaler
        robust_scaled = robustScaler.fit_transform(mega_df[[col]])
        # Apply MinMaxScaler to the output of RobustScaler
        min_max_scaled = minMaxScaler.fit_transform(robust_scaled)
        # Option 1: Replace original column
        mega_df[col] = min_max_scaled


In [14]:
#Takes about 5 seconds to run for 284 movies x 2455 columns.
build_movies_vector_df(mega_df, bert_embeddings_matrix)
mega_df

Unnamed: 0,imdb_movie_id,movie_title,embed_0_OHE,embed_1_OHE,embed_2_OHE,embed_3_OHE,embed_4_OHE,embed_5_OHE,embed_6_OHE,embed_7_OHE,...,Drama_genres_OHE,Romance_genres_OHE,Thriller_genres_OHE,Music_genres_OHE,Fantasy_genres_OHE,Crime_genres_OHE,year_norm,runtimes_norm,rating_norm,votes_norm
0,6495056,Migration,0.037469,-0.112327,0.315960,0.104982,0.569557,-0.091606,-0.162811,0.848821,...,0,0,0,0,1,0,1.0,0.378788,0.666667,0.013647
1,6166392,Wonka,-0.245949,-0.084754,0.330746,0.026666,0.412024,0.082788,0.202446,0.472888,...,0,0,0,0,1,0,1.0,0.545455,0.754386,0.127705
2,14230458,Poor Things,-0.241375,-0.030417,0.270869,-0.250320,0.588595,0.287686,0.400109,0.400080,...,1,1,0,0,0,0,1.0,0.671717,0.964912,0.087329
3,15398776,Oppenheimer,-0.328942,0.086324,-0.058186,-0.252259,0.300462,-0.032500,0.157125,0.463829,...,1,0,0,0,0,0,1.0,0.868687,0.964912,1.000000
4,26047818,Anyone But You,0.166215,-0.195031,0.368130,0.003242,0.345846,-0.206677,0.463624,0.542353,...,0,1,0,0,0,0,1.0,0.479798,0.666667,0.021243
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119,2183014,Love at First Sight,0.119597,-0.460413,0.486524,-0.197686,0.666635,0.101387,0.241150,0.839674,...,1,1,0,0,0,0,1.0,0.419192,0.684211,0.045582
120,16280138,Magic Mike's Last Dance,-0.314260,-0.135634,0.378015,-0.160352,0.414932,0.010165,0.220735,0.412693,...,1,0,0,0,0,0,1.0,0.525253,0.403509,0.023185
121,5635026,Peter Pan & Wendy,-0.370496,-0.008256,0.331004,-0.110533,0.771906,-0.096079,0.174939,0.565817,...,0,1,0,0,1,0,1.0,0.494949,0.263158,0.042124
122,13521006,Beau Is Afraid,-0.191838,0.019865,0.245550,-0.132557,0.572227,0.173554,0.324223,0.490928,...,1,0,0,0,0,0,1.0,0.863636,0.666667,0.086002


In [15]:
def find_indices_with_string(df, search_string):
    return [i for i, key in enumerate(df.columns) if search_string in key]

def get_weights(df):
    # Find the indices
    embedding_indices = find_indices_with_string(df, "embed")
    genres_indices = find_indices_with_string(df, "genres")
    cast_indices = find_indices_with_string(df, "cast")
    director_indices = find_indices_with_string(df, "director")
    year_norm_indices = find_indices_with_string(df, "year_norm")
    rating_norm_indices = find_indices_with_string(df, "rating_norm")
    votes_norm_indices = find_indices_with_string(df, "votes_norm")
    remaining_indices = [num for num in range(len(df.columns)) if num not in set(embedding_indices + genres_indices + cast_indices + director_indices + year_norm_indices + rating_norm_indices + votes_norm_indices)]

    # Weights allocation
    embedding_total_weight = 80
    genre_total_weight = 12
    cast_weight = 5
    director_weight = 5
    rating_weight = 5
    year_weight = 0
    votes_weight = 1

    # Initialize weights array with zeros
    weights = [0] * len(df.columns)

    # Function to distribute weights
    def distribute_weights(indices, total_weight):
        if indices:  # Avoid division by zero
            per_feature_weight = total_weight / len(indices)
            for index in indices:
                weights[index] = per_feature_weight

    # Distribute weights based on category
    distribute_weights(embedding_indices, embedding_total_weight)
    distribute_weights(genres_indices, genre_total_weight)
    distribute_weights(cast_indices, cast_weight)
    distribute_weights(director_indices, director_weight)
    distribute_weights(year_norm_indices, year_weight)
    distribute_weights(rating_norm_indices, rating_weight)
    distribute_weights(votes_norm_indices, votes_weight)
    distribute_weights(remaining_indices, 0)  # Assuming no additional weight for remaining features

    weights = np.array(weights)
    weights_series = pd.Series(weights, index=df.columns)
    return weights_series

def weighted_vectors(mega_df):
    df = mega_df.drop(columns=['imdb_movie_id', 'movie_title'])
    weights = get_weights(df)
    weighted_df = df.mul(weights, axis=1)
    
    non_float_columns = mega_df[['imdb_movie_id', 'movie_title']]
    weighted_df = pd.concat([non_float_columns, weighted_df], axis=1)

    return weighted_df

weighted_vectors(mega_df)

Unnamed: 0,imdb_movie_id,movie_title,embed_0_OHE,embed_1_OHE,embed_2_OHE,embed_3_OHE,embed_4_OHE,embed_5_OHE,embed_6_OHE,embed_7_OHE,...,Drama_genres_OHE,Romance_genres_OHE,Thriller_genres_OHE,Music_genres_OHE,Fantasy_genres_OHE,Crime_genres_OHE,year_norm,runtimes_norm,rating_norm,votes_norm
0,6495056,Migration,0.003903,-0.011701,0.032913,0.010936,0.059329,-0.009542,-0.016959,0.088419,...,0.000000,0.000000,0.0,0.0,0.545455,0.0,0.0,0.0,3.333333,0.013647
1,6166392,Wonka,-0.025620,-0.008829,0.034453,0.002778,0.042919,0.008624,0.021088,0.049259,...,0.000000,0.000000,0.0,0.0,0.545455,0.0,0.0,0.0,3.771930,0.127705
2,14230458,Poor Things,-0.025143,-0.003168,0.028215,-0.026075,0.061312,0.029967,0.041678,0.041675,...,0.545455,0.545455,0.0,0.0,0.000000,0.0,0.0,0.0,4.824561,0.087329
3,15398776,Oppenheimer,-0.034265,0.008992,-0.006061,-0.026277,0.031298,-0.003385,0.016367,0.048315,...,0.545455,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,4.824561,1.000000
4,26047818,Anyone But You,0.017314,-0.020316,0.038347,0.000338,0.036026,-0.021529,0.048294,0.056495,...,0.000000,0.545455,0.0,0.0,0.000000,0.0,0.0,0.0,3.333333,0.021243
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119,2183014,Love at First Sight,0.012458,-0.047960,0.050680,-0.020592,0.069441,0.010561,0.025120,0.087466,...,0.545455,0.545455,0.0,0.0,0.000000,0.0,0.0,0.0,3.421053,0.045582
120,16280138,Magic Mike's Last Dance,-0.032735,-0.014129,0.039377,-0.016703,0.043222,0.001059,0.022993,0.042989,...,0.545455,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,2.017544,0.023185
121,5635026,Peter Pan & Wendy,-0.038593,-0.000860,0.034480,-0.011514,0.080407,-0.010008,0.018223,0.058939,...,0.000000,0.545455,0.0,0.0,0.545455,0.0,0.0,0.0,1.315789,0.042124
122,13521006,Beau Is Afraid,-0.019983,0.002069,0.025578,-0.013808,0.059607,0.018079,0.033773,0.051138,...,0.545455,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,3.333333,0.086002


In [16]:
#This calculates Cosines similarity between 2 vectors (movies).

#Note: Cosine similarity expects 2D matrices.
#To perform cosine similarity on vectors, remember to reshape the vector in the 2D shape (1, N), where N is the vector length.
#to-do: Update this function to become a weighted cosine, using weights from a file.
def get_cosine_similarity(movie_vector_1, movie_vector_2):
    cosine_sim = cosine_similarity(movie_vector_1, movie_vector_2)
    return cosine_sim

#Get the top movies relating to a given movie vector using cosine similarity.
#2 use cases for this:
# 1. given_movie_vector = a specific movie's embeddings. This will return top movies relating to that movie.
# 2. given_movie_vector = user_profile's vector. This will return top movies recommended for this user.

def get_top_movies_cosine(tfidf_matrix, given_movie_vector, movie_titles, top_n=5):

    # Compute cosine similarity between the movie at movie_index and all movies in the matrix
    cosine_similarities = get_cosine_similarity(given_movie_vector, tfidf_matrix).flatten()

    # Get the indices of the top_n movies with the highest cosine similarity scores
    # Use argsort and reverse it with [::-1] to get the indices in descending order of similarity
    # Skip the first one as it is the movie itself with a similarity of 1
    similar_indices = cosine_similarities.argsort()[::-1][1:top_n+1]

    # Get the scores for the top_n movies
    similar_scores = cosine_similarities[similar_indices]

    # Combine indices and scores into a list of tuples and return
    top_movies = [(movie_titles[index], index, score) for index, score in zip(similar_indices, similar_scores)]

    print(f"Top similar movies to the provided movie vector:\n")
    for num, (title, index, score) in enumerate(top_movies, start = 1):
        print(f"{num}. \"{title}\" at ROW {index} with similarity score: {score}")

    return top_movies

In [17]:
for i, title in enumerate(titles_with_synopsis):
    print(i, title)

0 Migration
1 Wonka
2 Poor Things
3 Oppenheimer
4 Anyone But You
5 Aquaman and the Lost Kingdom
6 The Marvels
7 Wish
8 Fast X
9 The Family Plan
10 DogMan
11 Spider-Man: Across the Spider-Verse
12 Napoleon
13 Barbie
14 Freelance
15 The Super Mario Bros. Movie
16 Transformers: Rise of the Beasts
17 Elemental
18 The Hunger Games: The Ballad of Songbirds & Snakes
19 John Wick: Chapter 4
20 Trolls Band Together
21 Rebel Moon: Part One - A Child of Fire
22 The Flash
23 Expend4bles
24 Meg 2: The Trench
25 Leo
26 Mission: Impossible - Dead Reckoning Part One
27 Killers of the Flower Moon
28 Five Nights at Freddy's
29 PAW Patrol: The Mighty Movie
30 The Equalizer 3
31 Guardians of the Galaxy Vol. 3
32 Gran Turismo
33 Silent Night
34 One Life
35 The Iron Claw
36 The Creator
37 The Nun II
38 Blue Beetle
39 Retribution
40 Thanksgiving
41 Sound of Freedom
42 The Zone of Interest
43 Talk to Me
44 Indiana Jones and the Dial of Destiny
45 The Little Mermaid
46 Ruby Gillman: Teenage Kraken
47 Creed III

In [18]:
#Find the movie you want to get recommendations for
desired_row = mega_df[mega_df['movie_title'] == 'Oppenheimer'].index[0]
# desired_row = 16

print("Just BERT")
#Compare the above results with just Bert
get_top_movies_cosine(bert_embeddings_matrix, bert_embeddings_matrix[desired_row].reshape(1, -1), titles_with_synopsis, 5)

print("All properties")
mega_matrix = mega_df.drop(columns=['imdb_movie_id', 'movie_title']).values
#Print top movies from cosine similarity on the mega DF
get_top_movies_cosine(mega_matrix, mega_matrix[desired_row].reshape(1, -1), titles_with_synopsis, 5);

print("All properties with weighted cosine")
weighted_mega_df = weighted_vectors(mega_df)
weighted_mega_matrix = weighted_mega_df.drop(columns=['imdb_movie_id', 'movie_title']).values
#Print top movies from cosine similarity on the mega DF
get_top_movies_cosine(weighted_mega_matrix, weighted_mega_matrix[desired_row].reshape(1, -1), titles_with_synopsis, 5);

Just BERT
Top similar movies to the provided movie vector:

1. "Indiana Jones and the Dial of Destiny" at ROW 44 with similarity score: 0.9276425838470459
2. "The Pope's Exorcist" at ROW 57 with similarity score: 0.9089657068252563
3. "American Fiction" at ROW 81 with similarity score: 0.9088834524154663
4. "Saw X" at ROW 51 with similarity score: 0.9064996242523193
5. "Teenage Mutant Ninja Turtles: Mutant Mayhem" at ROW 59 with similarity score: 0.902595579624176
All properties
Top similar movies to the provided movie vector:

1. "Sound of Freedom" at ROW 41 with similarity score: 0.8256009068453405
2. "American Fiction" at ROW 81 with similarity score: 0.8224754813436945
3. "Indiana Jones and the Dial of Destiny" at ROW 44 with similarity score: 0.8188038442341339
4. "Napoleon" at ROW 12 with similarity score: 0.8102894983715512
5. "Poor Things" at ROW 2 with similarity score: 0.8100522845967266
All properties with weighted cosine
Top similar movies to the provided movie vector:

1. 

In [19]:
#Calculate updated user profile after they have voted on M movies. 
# M = 1 means immediate feedback loop. But it may not be ideal. It might bias our recommendations towards our initial dataset (High exploit, low explore)
# I think M = 5 or 10 might be better. 
# An even better idea is a hybrid of the above. M = 10 inititally, and after some votes M --> 1. 

def update_user_profile_batch(user_profile, movie_vectors, ratings, M):
    """
    Update the user profile based on a batch of movie ratings.

    :param user_profile: scipy.sparse matrix, the current user profile vector (1, N)
    :param movie_vectors: list of scipy.sparse matrices, the TF-IDF vectors of the rated movies [(1, N), (1, N), ...]
    :param ratings: list of str, the ratings for each movie ('like' or 'dislike')
    :param M: int, the number of ratings to process before updating the profile
    :return: scipy.sparse matrix, the updated user profile vector (1, N)
    """
    dislike_factor = 1/3 #we can tweak this to see impact on recommendations. 

    if len(movie_vectors) != len(ratings):
        raise ValueError("The number of movie vectors and ratings must be the same")

    if len(movie_vectors) < M:
        raise ValueError("The number of movie vectors must be at least M")

    # Initialize a temporary profile change vector
    profile_change = sparse.csr_matrix((1, user_profile.shape[1]))

    # Process each movie vector and rating
    for movie_vector, rating in zip(movie_vectors, ratings):
        if rating == 'like':
            profile_change += movie_vector
        elif rating == 'dislike':
            profile_change -= (dislike_factor * movie_vector)
        else:
            raise ValueError("Rating must be 'like' or 'dislike'")

    # Update the user profile after processing M ratings
    updated_profile = user_profile + profile_change

    # Normalize the updated profile
    updated_profile = updated_profile/len(ratings)

    return updated_profile

In [20]:
#B. For testing user profile recommendations.
#   Just list the movies and the votes. This code will return a new "seeded" user profile
movie_titles = ["Wonka", "Oppenheimer", "The Iron Claw", "Aquaman and the Lost Kingdom", "Family Switch", "Murder Mystery 2", "No Hard Feelings", "Insidious: The Red Door", "Shazam! Fury of the Gods"]
ratings = ['dislike', 'like', 'like', "dislike", "like", "like", "dislike", "like", "dislike"]

#Get the selected rows as a dataframe
selected_rows = pd.DataFrame()
for title in movie_titles:
    selected_rows = pd.concat([selected_rows, weighted_mega_df[weighted_mega_df['movie_title'] == title]], ignore_index=True)

#Take the selected movies df, and convert to list of sparse matrices (1, vector length), but remove the first two columns which are IMDB id and title.
selected_movie_vectors = [sparse.csr_matrix(row.reshape(1, -1)) for row in selected_rows.iloc[:, 2:].values]

#initialize empty user_profile of the right shape.
user_profile = sparse.csr_matrix((1, selected_movie_vectors[0].shape[1]), dtype= float)

updated_user_profile = update_user_profile_batch(user_profile, selected_movie_vectors, ratings, len(ratings))

In [21]:
#Get the top movies recommended for the above generated user profile
get_top_movies_cosine(weighted_mega_matrix, updated_user_profile, titles_with_synopsis, 100);


Top similar movies to the provided movie vector:

1. "Oppenheimer" at ROW 3 with similarity score: 0.9734712624433033
2. "American Fiction" at ROW 81 with similarity score: 0.972837594404523
3. "Sound of Freedom" at ROW 41 with similarity score: 0.9726494917913688
4. "Radical" at ROW 58 with similarity score: 0.9723845937485477
5. "One Life" at ROW 34 with similarity score: 0.9714877494053245
6. "Past Lives" at ROW 60 with similarity score: 0.9702603077636577
7. "The Holdovers" at ROW 78 with similarity score: 0.9685833692291301
8. "Creed III" at ROW 47 with similarity score: 0.9685137859061657
9. "Leave the World Behind" at ROW 72 with similarity score: 0.9680589163681984
10. "Talk to Me" at ROW 43 with similarity score: 0.9652212333916432
11. "Evil Dead Rise" at ROW 54 with similarity score: 0.9648235015646776
12. "Poor Things" at ROW 2 with similarity score: 0.9641789606363311
13. "May December" at ROW 89 with similarity score: 0.964168494299067
14. "Killers of the Flower Moon" at R

In [94]:
#For future convenience, we should have an easy way to go from IMDB ID -> title and vice versa:

id_to_title_dict = pd.Series(weighted_mega_df.movie_title.values,index=weighted_mega_df.imdb_movie_id).to_dict()
title_to_id_dict = pd.Series(weighted_mega_df.imdb_movie_id.values,index=weighted_mega_df.movie_title).to_dict()

movie_title = 'Wonka'  # Example movie title
movie_id = title_to_id_dict.get(movie_title)

print(f"The IMDb ID for '{movie_title}' is {movie_id}.")

The IMDb ID for 'Wonka' is 6166392.


In [95]:
# Global variables:

#1. Room profile: the aggregated profile of the full room.
room_profile = sparse.csr_matrix((1, selected_movie_vectors[0].shape[1]), dtype= float)

#2. All user profiles as a dictionary, indexed by user_id
user_profiles = {}

#3. History of all movies shown to all users:
shown_movies_history = set()

In [96]:
#3. All user's CURRENT voting activity, indexed by user_id, then by processed/unprocessed and then by movie_id
# Note that this is different from "all_user_votes" further below which is coming from the csv directly. 
# Effectively that variable can see the future. This variable here is the CURRENT voting activity.
# structure will be: UserId: <list of imdb movie ids>
user_votes_activity = {}


In [97]:
import csv

def read_user_votes(filename):
    user_votes = []
    try:
        with open(filename, newline='', encoding='utf-8') as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                # Convert row to dictionary and append to the list
                user_votes.append({'UserId': row['UserId'], 'Movie': row['Movie'], 'Vote': row['Vote']})
    except FileNotFoundError:
        print(f"The file {filename} was not found.")
    except Exception as e:
        print(f"An error occurred: {e}")
    return user_votes

filename = 'userVotes.csv'
all_user_votes = read_user_votes(filename)
print(all_user_votes)

[{'UserId': '1', 'Movie': 'Wonka', 'Vote': ' like'}, {'UserId': '2', 'Movie': 'The Marvels', 'Vote': ' dislike'}]


In [98]:
def save_user_votes(user_id, movie, vote, user_votes_activity, title_to_id_dict):
    
    if user_id not in user_votes_activity:
        user_votes_activity[user_id] = {
            'processed_movies': [],
            'processed_votes': [],
            'unprocessed_movies': [],
            'unprocessed_votes': [],
            'processed_count': 0,
            'unprocessed_count': 0,
        }
    
    #Now the user_id key will definitely exist in the dictionary. Now add the new vote.
    
    #As a principle, let's save movie_id in our internal variables. The csv can be the title for ease of testing.
    #Using strip() to remove any whitespace at the ends which can cause the movie_id to not be found
        
    movie_id = title_to_id_dict.get(movie.strip()) #imdb ID
    print(movie_id)

    user_votes_activity[user_id]['unprocessed_movies'].append(movie_id)
    user_votes_activity[user_id]['unprocessed_votes'].append(vote.strip())
    user_votes_activity[user_id]['unprocessed_count'] += 1

    print("Update vote history for this user:", user_votes_activity[user_id])

In [99]:
def update_room_profile():
    print("Dummy: Updated room profile")

In [100]:
def updated_user_profile():
    print("Dummy: Updated some user's profile")

In [101]:
def accept_user_vote(user_vote, user_votes_activity):
    user_id = user_vote["UserId"]
    movie = user_vote["Movie"]
    vote = user_vote["Vote"]
    
    # TODO: Logic on what to do every time we get a user vote goes in here

    # Every time a user votes, save the user vote
    save_user_votes(user_id, movie, vote, user_votes_activity, title_to_id_dict)

    MIN_VOTES_BEFORE_UPDATE = 5

    if user_votes_activity[user_id]['unprocessed_count'] >= MIN_VOTES_BEFORE_UPDATE:
        update_user_profile()
    
    update_room_profile()

In [103]:
#After csv stuff, test execution starts here:

for user_vote in all_user_votes:
    accept_user_vote(user_vote, user_votes_activity)

6166392
Update vote history for this user: {'processed_movies': [], 'processed_votes': [], 'unprocessed_movies': ['6166392'], 'unprocessed_votes': ['like'], 'processed_count': 0, 'unprocessed_count': 1}
Dummy: Updated room profile
10676048
Update vote history for this user: {'processed_movies': [], 'processed_votes': [], 'unprocessed_movies': ['10676048'], 'unprocessed_votes': ['dislike'], 'processed_count': 0, 'unprocessed_count': 1}
Dummy: Updated room profile


In [None]:
def update_room_profile():
    print("Dummy: Updated room profile")