In [4]:
import pandas as pd
# Load the MovieLens movie catalogue (roughly 62k movies: the rendered frame ends at index 62421)
movies = pd.read_csv("ml-25m/movies.csv")
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [5]:
#clean data with regex
import re

def clean_title(title):
    """Strip every character that is not an ASCII letter, digit or space.

    Example: "Toy Story (1995)" becomes "Toy Story 1995".
    """
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)

# Add a "clean_title" column holding each raw title after it has been
# passed through clean_title (element by element)
movies["clean_title"] = movies["title"].map(clean_title)
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [7]:
#build search engine
from sklearn.feature_extraction.text import TfidfVectorizer

# ngram_range=(1,2): index single words and two-word phrases (unigrams + bigrams),
# so multi-word queries can match on word pairs and not only on isolated words
vectorizer = TfidfVectorizer(ngram_range=(1,2))
# Fit the vectorizer on the cleaned titles and encode every title as a TF-IDF vector
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [9]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_k=5):
    """Return the movies whose cleaned titles best match a free-text query.

    The query is cleaned with ``clean_title``, vectorized with the fitted
    module-level ``vectorizer`` and compared against every row of ``tfidf``
    using cosine similarity. Rows of ``tfidf`` are looked up positionally in
    ``movies``.

    Parameters
    ----------
    title : str
        Raw search text typed by the user.
    top_k : int, default 5
        Maximum number of matches to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_k`` rows of ``movies``, ordered from most to least similar.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    # One similarity score per indexed title
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # argpartition raises if asked for more elements than exist, so clamp k
    top_k = min(top_k, similarity.size)
    if top_k <= 0:
        return movies.iloc[:0]

    # argpartition selects the k best scores in linear time but leaves them in
    # arbitrary order, so the k candidates are explicitly sorted best-first
    candidates = np.argpartition(similarity, -top_k)[-top_k:]
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]

    return movies.iloc[ranked]

In [15]:
#build interactive widget that allows user to type in a movie and see search results for most similar titles
import ipywidgets as widgets
from IPython.display import display

# Text box in which the user types a movie title
movie_input = widgets.Text(value="", description="Movie Title:", disabled=False)

# Output area that holds the most recent search results
movie_list = widgets.Output()

def on_type(data):
    """Refresh the results area whenever the text box value changes.

    ``data`` is the change notification passed by the widget; its "new" entry
    is the current text. Searches only run for text longer than 3 characters;
    shorter text just leaves the results area cleared.
    """
    with movie_list:
        # Drop whatever the previous keystroke rendered
        movie_list.clear_output()
        typed_text = data["new"]
        if len(typed_text) <= 3:
            return
        display(search(typed_text))

# Call on_type every time the text box's value changes
movie_input.observe(on_type, names='value')

# Render the text box followed by the results area
display(movie_input, movie_list)

Text(value='', description='Movie Title:')

Output()

In [20]:
movie_id = 89745

ratings = pd.read_csv("ml-25m/ratings.csv")

# Users who liked the chosen movie, i.e. rated it above 4
# (ratings of 4.5 also qualify, not only a score of 5)
similar_users = ratings.loc[
    (ratings["movieId"] == movie_id) & (ratings["rating"] > 4), "userId"
].unique()

# Every movie those users also rated above 4 (one entry per qualifying rating)
similar_user_recs = ratings.loc[
    ratings["userId"].isin(similar_users) & (ratings["rating"] > 4), "movieId"
]

3741           318
3742           527
3743           541
3744           589
3745           741
             ...  
24998517     91542
24998518     92259
24998522     98809
24998523    102125
24998524    112852
Name: movieId, Length: 577796, dtype: int64

In [21]:
# Convert the liked-movie list into, per movie, the fraction of similar users
# who liked it, and keep only movies liked by more than 10% of them.
# NOTE: this rebinds similar_user_recs from movie ids to fractions, so the
# cell that builds similar_user_recs must be re-run before re-running this one.
similar_user_recs = (
    (similar_user_recs.value_counts() / len(similar_users))
    .loc[lambda share: share > .10]
)
similar_user_recs

movieId
89745    1.000000
58559    0.573393
59315    0.530649
79132    0.519715
2571     0.496687
           ...   
47610    0.103545
780      0.103380
88744    0.103048
1258     0.101226
1193     0.100895
Name: count, Length: 193, dtype: float64

In [22]:
# Ratings above 4 given by any user to one of the movies in similar_user_recs
all_users = ratings.loc[
    ratings["movieId"].isin(similar_user_recs.index) & (ratings["rating"] > 4)
]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
29,1,4973,4.5,1147869080
48,1,7361,5.0,1147880055
72,2,110,5.0,1141416589
76,2,260,5.0,1141417172
...,...,...,...,...
25000065,162541,5952,5.0,1240952617
25000078,162541,7153,5.0,1240952613
25000081,162541,7361,4.5,1240953484
25000086,162541,31658,4.5,1240953287


In [24]:
# For each movie in all_users: number of above-4 ratings divided by the number
# of distinct users appearing in all_users
all_user_recs = (
    all_users["movieId"].value_counts()
    / all_users["userId"].unique().size
)

In [25]:
# Put the two fractions side by side, aligned on movieId: "similar" is how often
# similar users liked the movie, "all" is how often the wider user set did.
# Good recommendations are movies where "similar" is much larger than "all".
rec_percentages = pd.concat(
    [similar_user_recs, all_user_recs],
    axis=1,
    keys=["similar", "all"],
)
rec_percentages

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
89745,1.000000,0.040459
58559,0.573393,0.148256
59315,0.530649,0.054931
79132,0.519715,0.132987
2571,0.496687,0.247010
...,...,...
47610,0.103545,0.022770
780,0.103380,0.054723
88744,0.103048,0.010383
1258,0.101226,0.083887
