# Introduction

In this file, I will do an initial exploratory analysis to understand the problem. There won't be a complete solution in this file and things may be a bit loose for the sake of speed.

In [1]:
import pandas as pd 

movies = pd.read_csv('data/movies.csv')
movies.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [2]:
movies.tail(10)

Unnamed: 0,movieId,title,genres
62413,209145,Liberté (2019),Drama
62414,209147,The Carpet of Horror (1962),Crime|Horror
62415,209151,Mao Zedong 1949 (2019),(no genres listed)
62416,209153,Happy Flight (2008),Comedy|Drama
62417,209155,Santosh Subramaniam (2008),Action|Comedy|Romance
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)
62422,209171,Women of Devil's Island (1962),Action|Adventure|Drama


In [3]:
# extra characters like parentheses in the movie titles will make it difficult to search for movies.
# I will use regular expressions to clean up the title column by removing any character that isn't a letter, digit, or a space.

import re 

def clean_title(title: str) -> str:
    return re.sub("[^a-zA-Z0-9 ]", "", title)

movies['clean_title'] = movies['title'].apply(lambda title: clean_title(title))
movies['clean_title'].head(10)

0                      Toy Story 1995
1                        Jumanji 1995
2               Grumpier Old Men 1995
3              Waiting to Exhale 1995
4    Father of the Bride Part II 1995
5                           Heat 1995
6                        Sabrina 1995
7                   Tom and Huck 1995
8                   Sudden Death 1995
9                      GoldenEye 1995
Name: clean_title, dtype: object

Since the movie titles are text-based and ML models don't work directly on words, I will use the `TfidfVectorizer` tool in python to transform the text data into numerical form that captures how important each word is.

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1, 2))

tfidf = vectorizer.fit_transform(movies['clean_title'])
print(tfidf)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 446566 stored elements and shape (62423, 170073)>
  Coords	Values
  (0, 153609)	0.4788631896261391
  (0, 138134)	0.30818287987354687
  (0, 763)	0.2947573407787223
  (0, 153617)	0.5236464902527855
  (0, 138180)	0.5609151642422612
  (1, 763)	0.3284429867728573
  (1, 76515)	0.6556226145512709
  (1, 76516)	0.679914841526996
  (2, 763)	0.22159051090518359
  (2, 61531)	0.4587178998289233
  (2, 107020)	0.2945915056134832
  (2, 93306)	0.2658829644982531
  (2, 61532)	0.4587178998289233
  (2, 107075)	0.4026827592738571
  (2, 93339)	0.4587178998289233
  (3, 763)	0.21653641961669365
  (3, 161345)	0.3375257478128795
  (3, 151795)	0.18830007825002149
  (3, 47814)	0.4482553482876627
  (3, 161363)	0.4482553482876627
  (3, 151964)	0.4482553482876627
  (3, 47815)	0.4482553482876627
  (4, 763)	0.19764049948025617
  (4, 49639)	0.2743880168654919
  (4, 104437)	0.1176070632906874
  :	:
  (62419, 165119)	0.3842738783112516
  (62419, 842)	0.2166429

With the `tfidf` matrix calculated, computing the similarity between our search term and the titles in our data becomes much more straightforward. The `scikit-learn` library has a handy function called `cosine_similarity()` to accomplish this.

In [5]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np 

# for each given search term (title), this function will create and return an NDArray whose cells are values between 0 and 1 indicating how similar the search term and the title at the corresponding location in the tfidf matrix we created previously. A value closer to 1 means they are more similar.
def search(title: str) -> np.ndarray:
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    return similarity

# We remember that movies.head() showed a few titles with '1995' string in it, so we should expect when we call search("1995") that we get values > 0 for the first few elements of the ndarray
search("1995")


array([0.29475734, 0.32844299, 0.22159051, ..., 0.        , 0.        ,
       0.        ], shape=(62423,))

In [6]:
# return the top 5 highest matching movies based on the search term
similarity = search("Toy Story 1995")
indices = np.argpartition(similarity, -5)[-5:]
top_5_movies = movies.iloc[indices]
top_5_movies

Unnamed: 0,movieId,title,genres,clean_title
20497,106022,Toy Story of Terror (2013),Animation|Children|Comedy,Toy Story of Terror 2013
14813,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
59767,201588,Toy Story 4 (2019),Adventure|Animation|Children|Comedy,Toy Story 4 2019
3021,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995


Ok, I think I want to create another search function that handles the fetching of rows from the `movies` dataframe. The new function `nsearch(title, n)` will accept a second parameter, `n`, which corresponds to the top N matches for the given title. 


In [7]:
def nsearch(title: str, n: int = 1) -> pd.DataFrame:
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = np.argpartition(similarity, -n)[-n:]
    results = movies.iloc[indices].sort_index()
    return results

# lets see if it works as expected
top_matching_movie = nsearch("Toy Story 1995")
top_matching_movie # should be the row containing 'Toy Story 1995'

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995


In [8]:
top_5_matching_movies = nsearch("Toy Story 1995", n=5)
top_5_matching_movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
14813,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
20497,106022,Toy Story of Terror (2013),Animation|Children|Comedy,Toy Story of Terror 2013
59767,201588,Toy Story 4 (2019),Adventure|Animation|Children|Comedy,Toy Story 4 2019


To make it easier to interact with the search function, I will use the `ipywidgets` library to create a search box that conveniently displays results.

In [9]:
import ipywidgets as widgets
from IPython.display import display

movie_input = widgets.Text(
    value="",
    description="Movie Title:",
    placeholder="Search for a movie...",
    disabled=False
)

n_input = widgets.BoundedIntText(
    value=5,          
    min=1, max=50,    
    step=1,
    description="Results:"
)

movie_list = widgets.Output()

# handler that reacts to either input changing
def run_search(change=None):
    with movie_list:
        movie_list.clear_output(wait=True)
        title = movie_input.value.strip()
        n = n_input.value or 5 
        if len(title) >= 3:
            display(nsearch(title, n=int(n)))

movie_input.observe(run_search, names="value")
n_input.observe(run_search, names="value")

display(widgets.VBox([widgets.HBox([movie_input, n_input]), movie_list]))

VBox(children=(HBox(children=(Text(value='', description='Movie Title:', placeholder='Search for a movie...'),…

Ok, now I want to start working on a feature that finds all users who also liked the movie that we searched for. I have a `ratings.csv` file that can help with this, so I will start by reading it in and looking at some of the columns.

In [10]:
ratings = pd.read_csv('data/ratings.csv')
ratings.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
5,1,1088,4.0,1147868495
6,1,1175,3.5,1147868826
7,1,1217,3.5,1147878326
8,1,1237,5.0,1147868839
9,1,1250,4.0,1147868414


Let's say we want to find all users who also liked the movie with `movie_id` = 1. We can just filter out the ratings dataframe and collect all unique user ids. In this case, I can just arbirtrarily say that a user liked a movie if they gave it a rating greater than or equal to 4.

In [11]:
movie_id = 1
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings['rating'] >= 4)]["userId"].unique()
similar_users

array([     3,      5,      8, ..., 162530, 162533, 162534],
      shape=(37709,))

In [12]:
# now that we found all the users who also liked the movie with id = 1, it becomes easier to find the other movies they liked.
similar_user_recs = ratings[(ratings['userId'].isin(similar_users)) & (ratings['rating'] >= 4)]['movieId']
similar_user_recs

254              1
255             29
256             32
257             50
258            111
             ...  
24999332    166643
24999342    171763
24999348    177593
24999351    177765
24999378    198609
Name: movieId, Length: 5101989, dtype: int64

In [13]:
similar_user_recs.value_counts()

movieId
1         37709
318       20725
260       20043
356       19504
296       18694
          ...  
153913        1
153917        1
6501          1
41704         1
198609        1
Name: count, Length: 30595, dtype: int64

The value count is not useful on its own, as recommendations should probably be based on some kind of threshold (). So, in this case, I will filter it down further to find only the movies that more than 10% of similar users liked.

In [14]:
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
similar_user_recs = similar_user_recs[similar_user_recs > .1]
similar_user_recs

movieId
1       1.000000
318     0.549604
260     0.531518
356     0.517224
296     0.495744
          ...   
235     0.101249
1242    0.100931
1907    0.100772
3527    0.100613
2761    0.100135
Name: count, Length: 273, dtype: float64

With that done, I will now move on to find movies that are specific to the niche being searched for. For example, if someone likes the `Avengers`, then the recommendation system should suggest movies that other users like that are similar to the `Avengers`, rather than all of the movies they like because otherwise we end up recommending movies that are well-liked, but completely unrelated to the search term.

In [15]:
# find all users who rated a movie highly that is in our set of recommended movies
all_users = ratings[(ratings['movieId'].isin(similar_user_recs.index)) & (ratings['rating'] > 4)]
all_users

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
19,1,2692,5.0,1147869100
23,1,3949,5.0,1147868678
29,1,4973,4.5,1147869080
37,1,6016,5.0,1147869090
...,...,...,...,...
25000065,162541,5952,5.0,1240952617
25000077,162541,7147,4.5,1240952343
25000078,162541,7153,5.0,1240952613
25000081,162541,7361,4.5,1240953484


In [16]:
# find what percentage of all users recommend each of these movies
all_user_recs = all_users['movieId'].value_counts() / len(all_users["userId"].unique())
all_user_recs

movieId
318     0.331577
296     0.275820
2571    0.236444
356     0.227949
593     0.218883
          ...   
1907    0.019409
3175    0.019037
474     0.018338
2       0.017112
440     0.016509
Name: count, Length: 273, dtype: float64

Now that the percentages are found, we can just compare them.

In [17]:
rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
rec_percentages.columns = ["similar", "all"]
rec_percentages

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.000000,0.120850
318,0.549604,0.331577
260,0.531518,0.215296
356,0.517224,0.227949
296,0.495744,0.275820
...,...,...
235,0.101249,0.022579
1242,0.100931,0.023740
1907,0.100772,0.019409
3527,0.100613,0.023984


In [18]:
# find ratio score between percentages
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
rec_percentages = rec_percentages.sort_values("score", ascending=False)
rec_percentages

Unnamed: 0_level_0,similar,all,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1.000000,0.120850,8.274754
2355,0.191095,0.024311,7.860413
648,0.187382,0.028527,6.568707
440,0.104537,0.016509,6.332170
3114,0.328914,0.052036,6.320939
...,...,...,...
858,0.355883,0.203523,1.748618
2959,0.351826,0.209977,1.675543
318,0.549604,0.331577,1.657542
79132,0.209870,0.127298,1.648656


In [19]:
rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,1.0,0.12085,8.274754,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
2264,0.191095,0.024311,7.860413,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
637,0.187382,0.028527,6.568707,648,Mission: Impossible (1996),Action|Adventure|Mystery|Thriller,Mission Impossible 1996
435,0.104537,0.016509,6.33217,440,Dave (1993),Comedy|Romance,Dave 1993
3021,0.328914,0.052036,6.320939,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
3650,0.128378,0.020756,6.184955,3751,Chicken Run (2000),Animation|Children|Comedy,Chicken Run 2000
584,0.200642,0.03244,6.184933,592,Batman (1989),Action|Crime|Thriller,Batman 1989
1,0.105598,0.017112,6.170978,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2705,0.152139,0.024863,6.119119,2797,Big (1988),Comedy|Drama|Fantasy|Romance,Big 1988
2895,0.15129,0.024882,6.08028,2987,Who Framed Roger Rabbit? (1988),Adventure|Animation|Children|Comedy|Crime|Fant...,Who Framed Roger Rabbit 1988


Okay so it looks we are getting some recommendations based on the search term. Now, I can clean it up by putting what I've done into a function.

In [20]:
def find_similar_movies(movie_id: str | int) -> pd.DataFrame:
    similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()
    similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]

    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > 0.10]

    all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]

similar_movies = find_similar_movies(1)
similar_movies

Unnamed: 0,score,title,genres
0,8.017414,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3021,5.225654,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy
2264,4.405452,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy
14813,4.354038,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX
4780,3.320783,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy
580,3.208539,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical
6258,3.156862,Finding Nemo (2003),Adventure|Animation|Children|Comedy
587,2.99115,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX
8246,2.972889,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy
359,2.954762,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX


A widget like the one made earlier will make it easier to test out whether recommendations seem appropriate given the search term.

In [21]:
movie_name_input = widgets.Text(
    value="",
    description="Movie Title:",
    disabled=False
)

recommendation_list = widgets.Output()

def on_type(data):
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]
        if len(title) > 3:
            results = nsearch(title, 10)
            movie_id = results.iloc[0]["movieId"]
            display(find_similar_movies(movie_id))

movie_name_input.observe(on_type, names="value")

display(movie_name_input, recommendation_list)

Text(value='', description='Movie Title:')

Output()