# Movie Selector Based On Preferences

### Griffin Brown

This notebook serves as a movie recommendation app: given a movie title as input, it outputs a list of similar movies to watch.

In [224]:
from flask import Flask, request, render_template
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.preprocessing import MinMaxScaler

In [225]:
# Dataset is hosted in a publicly readable Google Cloud Storage bucket.
# NOTE: despite "top_1000" in the file name, the loaded frame has 10001 rows.
df_url = 'https://storage.googleapis.com/data-projects/top_1000_popular_movies_tmdb.csv'

# Rows are split on '\n' only. The file's rows actually end in '\r\n', so the
# '\r' ends up glued to the last column's header and values; it is stripped
# below.
# NOTE(review): presumably '\n' is forced because some text fields contain bare
# '\r' characters -- confirm before removing `lineterminator`.
df = pd.read_csv(df_url, lineterminator='\n')

# Local snapshot of the frame exactly as it was read.
df.to_csv('example.csv')

# Remove the stray carriage return from the last column's name and from its
# string values (non-string cells such as NaN are left untouched).
df.columns = df.columns.str.rstrip('\r')
last_col = df.columns[-1]
df[last_col] = df[last_col].map(
    lambda value: value.rstrip('\r') if isinstance(value, str) else value
)

df.shape

(10001, 15)

In [226]:
# Peek at the first ten rows to check that the columns parsed as expected.
df.head(10)

Unnamed: 0.1,Unnamed: 0,id,title,release_date,genres,original_language,vote_average,vote_count,popularity,overview,budget,production_companies,revenue,runtime,tagline\r
0,0,385687,Fast X,5/17/2023,"['Action', 'Crime', 'Thriller']",English,7.4,1347.0,8363.473,Over many missions and against impossible odds...,340000000.0,"['Universal Pictures', 'Original Film', 'One R...",652000000.0,142.0,The end of the road begins.\r
1,1,603692,John Wick: Chapter 4,3/22/2023,"['Action', 'Thriller', 'Crime']",English,7.9,2896.0,4210.313,"With the price on his head ever increasing, Jo...",90000000.0,"['Thunder Road', '87Eleven', 'Summit Entertain...",431769200.0,170.0,"No way back, one way out.\r"
2,2,502356,The Super Mario Bros. Movie,4/5/2023,"['Animation', 'Family', 'Adventure', 'Fantasy'...",English,7.8,4628.0,3394.458,"While working underground to fix a water main,...",100000000.0,"['Universal Pictures', 'Illumination', 'Ninten...",1308767000.0,92.0,\r
3,3,569094,Spider-Man: Across the Spider-Verse,5/31/2023,"['Action', 'Adventure', 'Animation', 'Science ...",English,8.8,1160.0,2859.047,"After reuniting with Gwen Stacy, Brooklyn’s fu...",100000000.0,"['Columbia Pictures', 'Sony Pictures Animation...",313522200.0,140.0,It's how you wear the mask that matters\r
4,4,536437,Hypnotic,5/11/2023,"['Mystery', 'Thriller', 'Science Fiction']",English,6.5,154.0,2654.854,A detective becomes entangled in a mystery inv...,70000000.0,"['Studio 8', 'Solstice Productions', 'Ingeniou...",0.0,94.0,Control is an illusion.\r
5,5,667538,Transformers: Rise of the Beasts,6/6/2023,"['Action', 'Adventure', 'Science Fiction']",English,7.4,195.0,2453.807,When a new threat capable of destroying the en...,200000000.0,"['Skydance', 'Paramount', 'di Bonaventura Pict...",171045500.0,127.0,Unite or fall.\r
6,6,890771,The Black Demon,4/26/2023,"['Horror', 'Thriller']",English,6.5,151.0,1777.2,Oilman Paul Sturges' idyllic family vacation t...,0.0,"['Mucho Mas Media', 'Silk Mass', 'The Avenue E...",0.0,100.0,Nature bites back.\r
7,7,447277,The Little Mermaid,5/18/2023,"['Adventure', 'Family', 'Fantasy', 'Romance']",English,6.2,616.0,1448.64,"The youngest of King Triton’s daughters, and t...",250000000.0,"['Walt Disney Pictures', 'Lucamar Productions'...",414000000.0,135.0,"Watch and you'll see, some day I'll be, part o..."
8,8,76600,Avatar: The Way of Water,12/14/2022,"['Science Fiction', 'Adventure', 'Action']",English,7.7,8493.0,1344.884,Set more than a decade after the events of the...,460000000.0,"['20th Century Studios', 'Lightstorm Entertain...",2320250000.0,192.0,Return to Pandora.\r
9,9,713704,Evil Dead Rise,4/12/2023,"['Thriller', 'Horror']",English,7.1,1597.0,1285.781,Three siblings find an ancient vinyl that give...,15000000.0,"['New Line Cinema', 'Ghost House Pictures', 'R...",141512100.0,96.0,Mommy loves you to death.\r


In [227]:
# The first column is a leftover saved index (pandas labels such columns
# "Unnamed: ..."). Drop it only while it is still present, so that re-running
# this cell cannot go on to delete real columns such as `id` or `title`.
if str(df.columns[0]).startswith('Unnamed'):
    df = df.drop(columns=df.columns[0])

# Sample title used to exercise the recommenders in the cells below.
test_movie = 'Transformers: Rise of the Beasts'

### Overview Recommendation

In [228]:
# Missing plot summaries become empty strings so the vectorizer receives text
# for every row.
df['overview'] = df['overview'].fillna('')

# TF-IDF over the overviews, with English stop words removed.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['overview'])

# (number of movies, vocabulary size)
tfidf_matrix.shape

(10001, 28068)

In [229]:
# Pairwise dot products between all overview vectors; called with a single
# argument, linear_kernel compares the matrix against itself. TfidfVectorizer
# L2-normalises rows by default, so these dot products are cosine similarities.
# The result is a dense (n_movies x n_movies) array.
cosine_sim = linear_kernel(tfidf_matrix)

In [230]:
def get_recommendations(title, 
                        cosine_sim=cosine_sim,
                        top_n=10):
    """Return the titles whose overviews are most similar to `title`.

    Uses the module-level `df` for titles. Note that a later cell redefines
    this name with a vote/popularity-weighted variant.

    Parameters
    ----------
    title : str
        Exact movie title to look up in ``df['title']``. If several rows share
        the title, the first one is used.
    cosine_sim : 2-D array
        Pairwise similarity matrix, row/column ``i`` corresponding to the
        ``i``-th row of ``df``.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to `top_n` titles, most similar first.

    Raises
    ------
    IndexError
        If no row of ``df`` has the given title.
    """
    # Positional lookup: cosine_sim and .iloc are positional, so do not rely on
    # the DataFrame's index labels happening to equal row positions.
    matches = np.flatnonzero((df['title'] == title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"no movie titled {title!r} found in df['title']")
    idx = int(matches[0])

    scores = np.asarray(cosine_sim[idx])
    # Stable descending sort: ties keep their original row order.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the query movie explicitly rather than assuming it ranks first;
    # that assumption fails for ties (duplicate overviews) and for movies with
    # an empty overview, whose self-similarity is 0.
    movie_indices = ranked[ranked != idx][:top_n]
    return df['title'].iloc[movie_indices]

In [231]:
# Overview-only recommendations for the sample title.
get_recommendations(test_movie)

64            Transformers: The Last Knight
676          Transformers: Dark of the Moon
49          Transformers: Age of Extinction
78                 Transformers: Beginnings
343                        Transformers One
4327                     Ben 10 Alien Swarm
1689     Nausicaä of the Valley of the Wind
394                 Avengers: Age of Ultron
305                       Godzilla vs. Kong
454     Transformers: Revenge of the Fallen
Name: title, dtype: object

In [232]:
# Rescale the three rating signals with MinMaxScaler (default range [0, 1]) so
# they can be combined with the similarity scores on a comparable scale.
rating_cols = ['vote_average', 'vote_count', 'popularity']
scaler = MinMaxScaler()
df[rating_cols] = scaler.fit_transform(df[rating_cols])

# Parse release dates; values that cannot be parsed become NaT.
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')

# Age of each movie in days relative to the moment this cell runs, and an
# exponential decay of that age with a one-year (365-day) scale.
# NOTE: `time_decay` is not read by the recommendation functions shown in this
# notebook.
current_date = pd.to_datetime('today')
movie_age = current_date - df['release_date']
df['days_since_release'] = movie_age.dt.days
decay_exponent = -df['days_since_release'] / 365
df['time_decay'] = np.exp(decay_exponent)

In [233]:
def get_weighted_similarity(idx, 
                            cosine_sim=cosine_sim, 
                            df=df, 
                            weight_votes=0.02, 
                            weight_popularity=0.02):
    """Blend overview similarity with vote and popularity signals.

    For every movie ``i`` the score is::

        cosine_sim[idx][i]
        + weight_votes      * df['vote_average'][i]
        + weight_votes      * df['vote_count'][i]
        + weight_popularity * df['popularity'][i]

    Parameters
    ----------
    idx : int
        Row position of the query movie in `cosine_sim`.
    cosine_sim : 2-D array
        Pairwise similarity matrix aligned with the rows of `df`.
    df : pandas.DataFrame
        Must contain 'vote_average', 'vote_count' and 'popularity'. The
        notebook min-max scales these columns before this function is called.
    weight_votes : float
        Weight applied to both 'vote_average' and 'vote_count'.
    weight_popularity : float
        Weight applied to 'popularity'.

    Returns
    -------
    list of (int, float)
        ``(row position, weighted score)`` for every movie, in row order.
    """
    similarity = np.asarray(cosine_sim[idx], dtype=float)

    def _signal(column):
        # Missing metadata contributes no bonus; a NaN here would turn the
        # whole score into NaN and make the caller's sort order undefined.
        return df[column].fillna(0.0).to_numpy(dtype=float)

    # Vectorised over all movies; terms are added in the same order as the
    # formula in the docstring.
    weighted_scores = (similarity
                       + weight_votes * _signal('vote_average')
                       + weight_votes * _signal('vote_count')
                       + weight_popularity * _signal('popularity'))

    return list(enumerate(weighted_scores))

In [234]:
def get_recommendations(title, 
                        cosine_sim=cosine_sim, 
                        df=df, 
                        weight_votes=0.02, 
                        weight_popularity=0.02,
                        top_n=10):
    """Recommend movies by overview similarity boosted by votes and popularity.

    Parameters
    ----------
    title : str
        Exact movie title to look up in ``df['title']``. If several rows share
        the title, the first one is used.
    cosine_sim : 2-D array
        Pairwise similarity matrix aligned with the rows of `df`.
    df : pandas.DataFrame
        Movie table; passed through to `get_weighted_similarity`.
    weight_votes, weight_popularity : float
        Weights passed through to `get_weighted_similarity`.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to `top_n` titles, highest weighted score first.

    Raises
    ------
    IndexError
        If no row of `df` has the given title.
    """
    # Positional lookup: cosine_sim and .iloc are positional, so do not rely on
    # the DataFrame's index labels happening to equal row positions.
    matches = np.flatnonzero((df['title'] == title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"no movie titled {title!r} found in df['title']")
    idx = int(matches[0])

    # Similarity plus the vote/popularity bonus for every movie.
    sim_scores = get_weighted_similarity(idx,
                                         cosine_sim,
                                         df,
                                         weight_votes,
                                         weight_popularity)
    ranked = sorted(sim_scores, key=lambda pair: pair[1], reverse=True)
    # Remove the query movie explicitly: once bonus terms are added it is not
    # guaranteed to sit in first place, so slicing off position 0 could drop a
    # real recommendation and keep the query movie itself.
    movie_indices = [i for i, _ in ranked if i != idx][:top_n]
    return df['title'].iloc[movie_indices]

In [235]:
# Recommendations for the sample title using the weighted variant defined above.
get_recommendations(test_movie)

64           Transformers: The Last Knight
676         Transformers: Dark of the Moon
49         Transformers: Age of Extinction
78                Transformers: Beginnings
305                      Godzilla vs. Kong
1689    Nausicaä of the Valley of the Wind
172                 Spider-Man: Homecoming
4327                    Ben 10 Alien Swarm
8935           Transformers: Titans Return
12                                My Fault
Name: title, dtype: object