# Import Required Libraries

In [133]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from tabulate import tabulate
from IPython.display import display, HTML
import ipywidgets as widgets
import panel as pn

# Load Dataset

In [134]:
# Load the cleaned movies dataframe
movies_df = pd.read_csv("../data/processed-data/movies_cleaned.csv")

pd.set_option("display.max_columns", None)
movies_df.head(3)


Unnamed: 0,IMDB_ID,Title,Year,Release_Date,Release_Month,Age_Rating,Overview,Keywords,Genre,Director,Actors,Runtime,Metascore_Rating,IMDB_Rating,Rotten_Tomatoes_Rating,TMDB_Rating,Average_Rating,Won_Award,Oscar_Wins,Oscar_Nominations,Budget,Budget_Normalized,Revenue,Revenue_Normalized,Return_On_Investment,Popularity
0,tt0097499,henry v,1989,1989-10-05,October,pg-13,gritty adaption william shakespeares play engl...,['france kingdom theater play based on true st...,war,kenneth branagh,['kenneth branagh derek jacobi simon shepherd'],137,8.3,7.5,9.8,7.2,8.2,True,1,0,9000000,-0.873465,10200000,-0.801446,1.133333,18.771
1,tt1320253,the expendables,2010,2010-08-03,August,r,barney ross leads band highly skilled mercenar...,['rescue sniper island martial arts tattoo esc...,thriller,sylvester stallone,['sylvester stallone jason statham jet li'],103,4.5,6.4,4.2,6.2,5.325,False,0,0,80000000,0.317499,274470394,0.18825,3.43088,74.573
2,tt1025100,gemini man,2019,2019-10-02,October,pg-13,henry brogan elite 51 year assassin whos ready...,['hitman clone'],thriller,ang lee,['will smith mary elizabeth winstead clive owen'],117,3.8,5.7,2.7,6.3,4.625,False,0,0,140000000,1.323948,173469516,-0.189999,1.239068,27.266


# Clean the Dataset

In [None]:
# Remove brakcets and apostrophes from the Actors column
movies_df["Actors"] = movies_df["Actors"].str.replace("[", "", regex = False).str.replace("]", "", regex = False).str.replace("'", "", regex = False)
movies_df["Keywords"] = movies_df["Keywords"].str.replace("[", "", regex = False).str.replace("]", "", regex = False).str.replace("'", "", regex = False)

# Drop columns that won't be included in the cosine similarity calculation
columns_to_drop = ["IMDB_ID", "Keywords", "Won_Award", "Release_Date", "Release_Month", "Age_Rating", "Budget_Normalized", "Revenue_Normalized", "Metascore_Rating", "IMDB_Rating", "Rotten_Tomatoes_Rating", "TMDB_Rating"]
filtered_movies_df = movies_df.drop(columns = columns_to_drop)

# PReview the new dataframe
filtered_movies_df.head(3)

Unnamed: 0,Title,Year,Overview,Genre,Director,Actors,Runtime,Average_Rating,Oscar_Wins,Oscar_Nominations,Budget,Revenue,Return_On_Investment,Popularity
0,henry v,1989,gritty adaption william shakespeares play engl...,war,kenneth branagh,kenneth branagh derek jacobi simon shepherd,137,8.2,1,0,9000000,10200000,1.133333,18.771
1,the expendables,2010,barney ross leads band highly skilled mercenar...,thriller,sylvester stallone,sylvester stallone jason statham jet li,103,5.325,0,0,80000000,274470394,3.43088,74.573
2,gemini man,2019,henry brogan elite 51 year assassin whos ready...,thriller,ang lee,will smith mary elizabeth winstead clive owen,117,4.625,0,0,140000000,173469516,1.239068,27.266


## Compute TD-IDF and Cosine Similarity Scores for Text Data

In [136]:
# Combine all text features of each movie into one value of a new column
filtered_movies_df["combined_text_features"] = filtered_movies_df["Overview"] + " " + filtered_movies_df["Genre"] + " " + filtered_movies_df["Director"] + " " + filtered_movies_df["Actors"]

# Create a TF-IDF matrix to vectorize words for each movie's text features
vectorizer = TfidfVectorizer(max_features = 5000, 
                             ngram_range = (1, 2),
                             min_df = 2,
                             max_df = 0.95)
tfidf_matrix = vectorizer.fit_transform(filtered_movies_df["combined_text_features"])

# Calculate textual cosine similarity scores for each movie
text_cos_similarity = cosine_similarity(tfidf_matrix)

## Compute Cosine Similarity Scores for Numerical Data

In [137]:
# Filter the df to only include numerical columns
numerical_features = ["Runtime", "Runtime", "Average_Rating", "Oscar_Wins", "Return_On_Investment", 
                      "Budget", "Revenue", "Popularity"]

# Scale the values so that one column does not have an extreme bias towards the cosine similarity scores
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(filtered_movies_df[numerical_features])

# Calculate numerical cosine similarity scores for each movie
numerical_cos_similarity = cosine_similarity(scaled_features)

## Determine Cosine Similarity Score Weights for Each Datatype

In [158]:
# Set weights for each cosine similarity scores to determine whether text or numerical data has more say in the recommendations
text_weight = 0.25
numerical_weight = 0.75

# Create a combined cosine similarity score that uses both text and numerical features
combined_similarity = text_weight * text_cos_similarity + numerical_weight * numerical_cos_similarity


# Function to take in a movie and genreate 10 movies that are most similar to it
def recommend_movies(movie_title, top_n = 10):    
    # Obtain the index of the given movie
    selected_movie_index = filtered_movies_df[filtered_movies_df["Title"] == movie_title].index[0]

    # Obtain the similarity scores for the selected movie and place them in a list, along with each movie's index
    sim_scores = list(enumerate(combined_similarity[selected_movie_index]))

    # Sort movies based on similarity scores
    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)

    # Filter the list down to n movies with the highest similiarty scores (excluding the first index/selected movie)
    sim_scores = sim_scores[0:top_n + 1]

    # Get indices of the top-n similar movies
    movie_indices = [i[0] for i in sim_scores]
    movie_scores = [i[1].round(4) for i in sim_scores]
    
    # Create a new recommended movie df with selected features of the top movies by matching the indeces of the recommended movies
    columns_to_keep = ["IMDB_ID", "Title", "Year", "Age_Rating", "Genre", "Keywords", "Director", "Actors", "Average_Rating", "Revenue", "Budget", "Oscar_Wins"]

    recommendations_df = movies_df[columns_to_keep]
    recommendations_df = recommendations_df.iloc[movie_indices]
    recommendations_df["Similarity_Score"] = movie_scores

    # Return the top-n similar movies
    return recommendations_df

# Example

In [212]:
pd.set_option("display.max_rows", None)

selected_movie = "the da vinci code"
recommendations = recommend_movies(selected_movie, 1500).reset_index(drop = True)

recommendations = recommendations[["Title", "Year", "Age_Rating", "Genre", "Director", "Average_Rating", "Similarity_Score"]].reset_index(drop = True)
recommendations.index = recommendations.index + 1

recommendations.insert(0, "Rank", recommendations.index)


recommendations.head(100)

Unnamed: 0,Rank,Title,Year,Age_Rating,Genre,Director,Average_Rating,Similarity_Score
1,1,the da vinci code,2006,pg-13,thriller,ron howard,5.1,1.0
2,2,angels demons,2009,pg-13,thriller,ron howard,5.45,0.8119
3,3,inferno,2016,pg-13,mystery,ron howard,4.7,0.8016
4,4,far and away,1992,pg-13,romance,ron howard,5.825,0.7528
5,5,2012,2009,pg-13,action,roland emmerich,5.125,0.7498
6,6,jurassic world dominion,2022,pg-13,science fiction,colin trevorrow,4.75,0.7497
7,7,the league of extraordinary gentlemen,2003,pg-13,science fiction,stephen norrington,4.1,0.7452
8,8,troy,2004,r,war,wolfgang petersen,6.35,0.7447
9,9,flyboys,2006,pg-13,war,tony bill,5.3,0.7443
10,10,transformers dark of the moon,2011,pg-13,science fiction,michael bay,5.025,0.7439


In [206]:
recommendations.to_csv("../website/movie_guesser_dfs/da_vinci_code.csv", index = False)


In [211]:
# Convert to JSON format (list of dicts)
json_data = recommendations.to_json(orient="records", indent=2)

# Save to a JSON file
with open("../website/movie_guesser_dfs/da_vinci_code.json", "w") as f:
    f.write(json_data)