<a href="https://colab.research.google.com/github/gerardbullock/gerardbullock.github.io/blob/main/Movie_Recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#import libraries
import numpy as np
import pandas as pd
import sklearn

In [2]:
#import data
# Read the movies CSV into a DataFrame and preview its first rows.
movies = pd.read_csv('/content/movies.csv')
movies.head()


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
#keep only ASCII letters, digits and spaces in the title column (dashes, parentheses and other punctuation are removed)
import re

def clean_title(title):
    """Return ``title`` with everything except ASCII letters, digits and spaces removed.

    Removed characters are deleted, not replaced, so the words around a
    hyphen are joined together ("Ant-Man" becomes "AntMan").
    """
    return re.sub("[^a-zA-Z0-9 ]", "", title)

In [4]:
# Add a normalised copy of every title; the search features are built from this column.
movies["clean_title"]= movies["title"].apply(clean_title)
movies.head()

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
# ngram_range=(1,2): features are single words and pairs of adjacent words.
vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Learn the vocabulary from the cleaned titles and encode each title as one TF-IDF row.
tfidf = vectorizer.fit_transform(movies["clean_title"])
tfidf

<62423x170073 sparse matrix of type '<class 'numpy.float64'>'
	with 446566 stored elements in Compressed Sparse Row format>

In [6]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

#def search(title):
# Exploratory, inline version of the search: clean the query the same way as the
# titles, then encode it with the already-fitted vectorizer.
title = "Harry Potter"
title = clean_title(title)
query_vec = vectorizer.transform([title])
query_vec

<1x170073 sparse matrix of type '<class 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [7]:
#find similarity between search terms and titles

#def search(title):
title = "Harry Potter"
title = clean_title(title)
query_vec = vectorizer.transform([title])
# cosine_similarity returns a 2-D array (one row per query); flatten() leaves one score per title row.
similarity = cosine_similarity(query_vec, tfidf).flatten()
similarity

array([0., 0., 0., ..., 0., 0., 0.])

In [8]:
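#the truncated preview above shows only the first and last few scores, which happen to be 0; this does not mean there are no matches for Harry Potter (query_vec has 3 stored terms, so the query words are in the vocabulary) -- check similarity.max() or the top-scoring indices instead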

In [13]:
title = "Pulp Fiction"
title = clean_title(title)
query_vec = vectorizer.transform([title])
similarity = cosine_similarity(query_vec, tfidf).flatten()
# argpartition with kth=-5 moves the five largest scores into the last five
# positions (in no guaranteed order); the slice keeps those row positions.
indicies = np.argpartition(similarity, -5)[-5:]
indicies

array([50769, 36363,   292, 62065, 12294])

In [15]:
#return top 5 most similar movies
title = "Pulp Fiction"
title = clean_title(title)
query_vec = vectorizer.transform([title])
similarity = cosine_similarity(query_vec, tfidf).flatten()
indicies = np.argpartition(similarity, -5)[-5:]
# iloc selects rows by position, so the positions from argpartition map straight to movie rows.
results = movies.iloc[indicies]
results

Unnamed: 0,movieId,title,genres,clean_title
50769,181745,Plump Fiction (1998),Comedy|Crime,Plump Fiction 1998
36363,150038,Science Fiction (2002),Adventure|Children|Sci-Fi|Thriller,Science Fiction 2002
292,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,Pulp Fiction 1994
62065,207700,True Fiction (2018),Thriller,True Fiction 2018
12294,59114,Pulp (1972),Comedy|Thriller,Pulp 1972


In [16]:
#top match of most similar movies
title = "Pulp Fiction"
title = clean_title(title)
query_vec = vectorizer.transform([title])
similarity = cosine_similarity(query_vec, tfidf).flatten()
indicies = np.argpartition(similarity, -5)[-5:]
# [::-1] only reverses the row order; because argpartition leaves the five picks
# unordered, the result is not guaranteed to be ranked by similarity.
results = movies.iloc[indicies][::-1]
results

Unnamed: 0,movieId,title,genres,clean_title
12294,59114,Pulp (1972),Comedy|Thriller,Pulp 1972
62065,207700,True Fiction (2018),Thriller,True Fiction 2018
292,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,Pulp Fiction 1994
36363,150038,Science Fiction (2002),Adventure|Children|Sci-Fi|Thriller,Science Fiction 2002
50769,181745,Plump Fiction (1998),Comedy|Crime,Plump Fiction 1998


In [17]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title):
    """Return the five movies whose cleaned titles are most similar to ``title``.

    Uses the notebook-level ``vectorizer`` and ``tfidf`` matrix fitted on
    ``movies["clean_title"]`` and scores every movie by cosine similarity
    to the cleaned query.

    Parameters
    ----------
    title : str
        Free-text movie title typed by the user.

    Returns
    -------
    pandas.DataFrame
        The five best-matching rows of ``movies``, best match first.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    # argpartition gathers the five highest scores in the last five slots,
    # but in no particular order.
    indices = np.argpartition(similarity, -5)[-5:]
    # Rank those five candidates by score, highest first.
    indices = indices[np.argsort(-similarity[indices], kind="stable")]
    results = movies.iloc[indices]
    return results

In [18]:
# NOTE: this shows the notebook-level `results` left over from the exploratory cell above;
# the `results` assigned inside search() is local to that function.
results

Unnamed: 0,movieId,title,genres,clean_title
12294,59114,Pulp (1972),Comedy|Thriller,Pulp 1972
62065,207700,True Fiction (2018),Thriller,True Fiction 2018
292,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,Pulp Fiction 1994
36363,150038,Science Fiction (2002),Adventure|Children|Sci-Fi|Thriller,Science Fiction 2002
50769,181745,Plump Fiction (1998),Comedy|Crime,Plump Fiction 1998


In [21]:
import ipywidgets as widgets
from IPython.display import display

# Text box pre-filled with a sample title; evaluating it on the last line renders the widget.
movie_input = widgets.Text(
      value = "Pulp Fiction",
      description = "Movie Title:",
      disabled=False
)
movie_input

Text(value='Pulp Fiction', description='Movie Title:')

In [26]:
import ipywidgets as widgets
from IPython.display import display

# Text box pre-filled with a sample title.
movie_input = widgets.Text(
      value = "Pulp Fiction",
      description = "Movie Title:",
      disabled=False
)
# Output area intended to hold the search results.
movie_list = widgets.Output()

def on_type(data):
    """Clear the output area, then show search results for data["new"] when it is longer than 4 characters."""
    with movie_list:
      movie_list.clear_output()
      title = data["new"]
      if len(title) >  4:
        display(search(title))

# NOTE: in this cell on_type is only defined; it is not attached to movie_input,
# and movie_list is not displayed, so typing in the box has no visible effect.
movie_input

Text(value='Pulp Fiction', description='Movie Title:')

In [27]:
import ipywidgets as widgets
from IPython.display import display

# Search box (pre-filled with a sample title) and the area the matches are rendered into.
movie_input = widgets.Text(value='Toy Story', description='Movie Title:', disabled=False)
movie_list = widgets.Output()


def on_type(data):
    """Re-run the search each time the text box value changes.

    ``data`` is the change notification delivered by ``observe``; the new
    text is read from ``data["new"]``. Inputs of four characters or fewer
    only clear the previous results.
    """
    with movie_list:
        movie_list.clear_output()
        query = data["new"]
        if len(query) <= 4:
            return
        display(search(query))


# Call on_type whenever the widget's `value` trait changes, then render both widgets.
movie_input.observe(on_type, names='value')
display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [28]:
import ipywidgets as widgets
from IPython.display import display

# Search box seeded with the current value of the notebook-level `title` variable.
movie_input = widgets.Text(
    value=title,
    description="Movie Title:",
    disabled=False
)
# Area the search results are rendered into.
movie_list = widgets.Output()

def on_type(data):
    """Refresh the result list whenever the text box content changes.

    ``data`` is the change notification passed by ``observe``; its
    ``"new"`` entry holds the updated text.
    """
    with movie_list:
        movie_list.clear_output()
        title = data["new"]
        # Only search once more than four characters have been typed.
        if len(title) > 4:
            display(search(title))

# The Text widget's trait is named `value` (singular); a handler registered
# under any other name is never called.
movie_input.observe(on_type, names='value')
display(movie_input, movie_list)

Text(value='Pulp Fiction', description='Movie Title:')

Output()

In [29]:
# Example seed movie for the recommendation walkthrough
# (89745 is listed as "Avengers, The (2012)" in the merged output further down).
movie_id = 89745

#def find_similar_movies(movie_id):
# Row(s) of the movie table whose movieId equals the seed id.
movie = movies[movies["movieId"] == movie_id]

In [31]:
# Load the user ratings table (columns shown by ratings.dtypes: userId, movieId, rating, timestamp).
ratings = pd.read_csv("/content/ratings.csv")

In [99]:
# Inspect the column types before filtering on movieId and rating.
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [32]:
# Users who gave the seed movie a rating above 4.
similar_users = ratings.loc[
    (ratings["movieId"] == movie_id) & (ratings["rating"] > 4),
    "userId",
].unique()

In [33]:
# Every movie those users rated above 4 (one entry per rating, so movie ids repeat).
similar_user_recs = ratings.loc[
    ratings["userId"].isin(similar_users) & (ratings["rating"] > 4),
    "movieId",
]

In [34]:
# Turn the per-movie counts into the fraction of similar users who liked each movie.
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

# Keep only movies liked by more than 10% of the similar users.
similar_user_recs = similar_user_recs[similar_user_recs > .10]

In [35]:
# All ratings above 4 for the candidate movies, from any user.
all_users = ratings.loc[
    ratings["movieId"].isin(similar_user_recs.index) & (ratings["rating"] > 4)
]

In [36]:
# Fraction of the users present in all_users (those who liked at least one candidate) who liked each movie.
all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

In [37]:
# Put the two per-movie shares side by side, aligned on the movie id index.
rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
rec_percentages.columns = ["similar", "all"]

In [38]:
# Preview: one row per candidate movie id with its "similar" and "all" shares.
rec_percentages

Unnamed: 0,similar,all
1,0.268116,0.123741
47,0.195652,0.145177
50,0.188406,0.208509
110,0.137681,0.161741
260,0.391304,0.219877
...,...,...
134130,0.289855,0.046768
134853,0.202899,0.032803
152081,0.115942,0.018837
164179,0.130435,0.023059


In [39]:
# score = share among similar users / share among all users; values above 1 mean
# the movie is liked more often by the similar users than by users in general.
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

In [40]:
# Highest score first.
rec_percentages = rec_percentages.sort_values("score", ascending=False)

In [41]:
# Attach title/genres to the ten best-scoring movies by matching the index (movie ids) to movies["movieId"].
rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
17067,1.0,0.04482,22.311594,89745,"Avengers, The (2012)",Action|Adventure|Sci-Fi|IMAX,Avengers The 2012
19678,0.195652,0.012342,15.852975,102125,Iron Man 3 (2013),Action|Sci-Fi|Thriller|IMAX,Iron Man 3 2013
16312,0.144928,0.009419,15.387306,86332,Thor (2011),Action|Adventure|Drama|Fantasy|IMAX,Thor 2011
25061,0.115942,0.007795,14.874396,122900,Ant-Man (2015),Action|Adventure|Sci-Fi,AntMan 2015
16725,0.173913,0.011692,14.874396,88140,Captain America: The First Avenger (2011),Action|Adventure|Sci-Fi|Thriller|War,Captain America The First Avenger 2011
21348,0.289855,0.019812,14.630554,110102,Captain America: The Winter Soldier (2014),Action|Adventure|Sci-Fi|IMAX,Captain America The Winter Soldier 2014
25058,0.188406,0.013641,13.811939,122892,Avengers: Age of Ultron (2015),Action|Adventure|Sci-Fi,Avengers Age of Ultron 2015
21606,0.246377,0.018513,13.30867,111362,X-Men: Days of Future Past (2014),Action|Adventure|Sci-Fi,XMen Days of Future Past 2014
25071,0.224638,0.016889,13.301143,122920,Captain America: Civil War (2016),Action|Sci-Fi|Thriller,Captain America Civil War 2016
19807,0.144928,0.011692,12.39533,102903,Now You See Me (2013),Crime|Mystery|Thriller,Now You See Me 2013


In [42]:
def find_similar_movies(movie_id, min_rating=4, min_share=.10, top_n=10):
    """Recommend movies favoured by the users who rated ``movie_id`` highly.

    Reads the notebook-level ``ratings`` and ``movies`` DataFrames.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to start from.
    min_rating : float, default 4
        A rating counts as a "like" only when it is strictly greater than this.
    min_share : float, default 0.10
        A candidate is kept only when more than this fraction of the
        similar users liked it.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres``, highest score first.
        ``score`` is the share of similar users who liked the movie divided
        by the share of all users (who liked at least one candidate) that
        liked it.
    """
    # Same "liked" mask is reused by every filter below.
    liked = ratings["rating"] > min_rating

    # Users who liked the seed movie.
    similar_users = ratings.loc[liked & (ratings["movieId"] == movie_id), "userId"].unique()

    # Fraction of those users who liked each movie; drop rarely shared ones.
    similar_user_recs = ratings.loc[liked & ratings["userId"].isin(similar_users), "movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # Fraction of all users (who liked any candidate) who liked each candidate.
    all_users = ratings[liked & ratings["movieId"].isin(similar_user_recs.index)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    # A score above 1 means similar users like the movie more often than users overall.
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(top_n).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]

In [44]:
# NOTE: this displays the notebook-level rec_percentages built in the step-by-step cells above;
# find_similar_movies() works on its own local rec_percentages and returns only the merged top rows.
rec_percentages

Unnamed: 0,similar,all,score
89745,1.000000,0.044820,22.311594
102125,0.195652,0.012342,15.852975
86332,0.144928,0.009419,15.387306
122900,0.115942,0.007795,14.874396
88140,0.173913,0.011692,14.874396
...,...,...,...
527,0.210145,0.227996,0.921704
50,0.188406,0.208509,0.903585
110,0.137681,0.161741,0.851246
296,0.231884,0.287431,0.806747


/Users/gerardbullock/Downloads/ml-25m