In [1]:
import pandas as pd
# Load the movie catalogue; later cells read its movieId, title and genres columns.
movies = pd.read_csv("movies.csv")

In [2]:
import re
def clean_title(title):
    """Return `title` with every character removed that is not an ASCII
    letter, a digit or a space.

    Punctuation such as parentheses, hyphens and colons is dropped outright
    (not replaced by a space), so "Toy Story (1995)" becomes "Toy Story 1995".
    """
    # Negated character class: matches anything outside a-z, A-Z, 0-9 and space.
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)

In [3]:
# Store a normalised copy of every title (produced by clean_title) in a new "clean_title" column.
movies["clean_title"] = movies["title"].map(clean_title)

In [4]:
# Display the catalogue to check the new clean_title column (pandas elides the middle rows).
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
# ngram_range=(1,2) makes the vectorizer index both single words and two-word phrases.
vectorizer = TfidfVectorizer(ngram_range=(1,2)) # unigrams and bigrams
# Fit the vocabulary on the cleaned titles and keep the resulting TF-IDF matrix (one row per movie).
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [6]:
# compute similarities
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=5):
    """Return the movies whose titles best match `title`, best match first.

    The query is normalised with `clean_title`, transformed with the fitted
    `vectorizer`, and compared against every row of `tfidf` by cosine
    similarity.

    Parameters
    ----------
    title : str
        Free-text movie title to look up.
    top_n : int, optional
        Maximum number of matches to return (default 5).

    Returns
    -------
    pandas.DataFrame
        At most `top_n` rows of `movies`, ordered from the highest to the
        lowest similarity score. Empty when there is nothing to return.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more rows than exist: argpartition raises when kth is out of range.
    top_n = min(top_n, similarity.size)
    if top_n <= 0:
        return movies.iloc[[]]

    # argpartition only guarantees that the last `top_n` positions hold the
    # largest scores; their relative order is unspecified, so rank them
    # explicitly before indexing into `movies`.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    ranked = candidates[np.argsort(-similarity[candidates], kind="stable")]
    return movies.iloc[ranked]

In [8]:
import ipywidgets as widgets
from IPython.display import display

# Text box for the query. Named `title_input` rather than `input` so the
# builtin input() stays usable for the rest of the kernel session.
title_input = widgets.Text(
    value = "Iron Man",
    description = "Movie Title:",
    disabled = False
)
# Output area that holds the rendered search results.
movie_list = widgets.Output()

def on_search_type(data):
    """Refresh `movie_list` whenever the text box value changes.

    `data` is the change notification passed by `observe`; its "new" entry is
    read as the current text. Queries of five characters or fewer only clear
    the previous results and trigger no search.
    """
    with movie_list:
        movie_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            display(search(title))

# Only react to changes of the widget's `value` trait.
title_input.observe(on_search_type, names='value')

display(title_input, movie_list)

Text(value='Iron Man', description='Movie Title:')

Output()

In [9]:
# Load the user ratings; find_similar_movies reads its userId, movieId and rating columns.
ratings = pd.read_csv("ratings.csv")

In [10]:
def find_similar_movies(movie_id):
    """Recommend movies favoured by the users who liked `movie_id`.

    A rating counts as a "like" when it is strictly greater than 4. For every
    movie liked by more than 10% of the users who liked `movie_id`, the score
    is the share of those users who liked it divided by the share of all
    contributing users who liked it, so movies that are specifically popular
    with this audience rank above movies that are popular with everyone.

    Parameters
    ----------
    movie_id
        Value matched against the "movieId" column of `ratings`.

    Returns
    -------
    pandas.DataFrame
        Up to ten rows with the columns "score", "title" and "genres",
        sorted by descending score.
    """
    liked = ratings["rating"] > 4

    # Users who liked the reference movie, and everything else those users liked.
    fans = ratings.loc[(ratings["movieId"] == movie_id) & liked, "userId"].unique()
    fan_likes = ratings.loc[ratings["userId"].isin(fans) & liked, "movieId"]

    # Share of fans that liked each movie; keep only movies above the 10% mark.
    fan_share = fan_likes.value_counts() / len(fans)
    fan_share = fan_share[fan_share > .10]

    # Share of all users (who liked at least one candidate) that liked each candidate.
    everyone = ratings.loc[ratings["movieId"].isin(fan_share.index) & liked]
    overall_share = everyone["movieId"].value_counts() / len(everyone["userId"].unique())

    # Side-by-side table of both shares, indexed by movieId.
    table = pd.concat([fan_share, overall_share], axis=1)
    table.columns = ["similar", "all"]

    # A ratio above 1 means the movie is liked more often by fans than by users overall.
    table["score"] = table["similar"] / table["all"]

    best = table.sort_values("score", ascending=False).head(10)
    with_titles = best.merge(movies, left_index=True, right_on="movieId")
    return with_titles[["score", "title", "genres"]]

In [11]:
# Text box for the title whose recommendations should be shown.
input_name = widgets.Text(
    value = "Iron Man",
    description = "Movie Title:",
    disabled = False
)

# Output area that holds the rendered recommendation table.
rec_list = widgets.Output()

def on_recommend_type(data):
    """Refresh `rec_list` whenever the text box value changes.

    `data` is the change notification passed by `observe`; its "new" entry is
    read as the current text. Queries of five characters or fewer only clear
    the previous output. Otherwise the first row returned by `search` is taken
    as the intended movie and its recommendations are displayed.

    Uses its own name instead of `on_type` so it does not shadow the handler
    defined for the title-search widget.
    """
    with rec_list:
        rec_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            results = search(title)
            # NOTE(review): relies on `search` returning its best match in the
            # first row - confirm that `search` orders results by similarity.
            movie_id = results.iloc[0]["movieId"]
            display(find_similar_movies(movie_id))

# Only react to changes of the widget's `value` trait.
input_name.observe(on_recommend_type, names="value")
display(input_name, rec_list)


Text(value='Iron Man', description='Movie Title:')

Output()