<a href="https://colab.research.google.com/github/elizabethzhu1/ml-movie-recommender/blob/main/movie_recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [128]:
import pandas as pd

# Load the movie table from movies.csv, resolved relative to the working directory.
movies = pd.read_csv("movies.csv")

In [129]:
# Preview the loaded table; the rendered output lists movieId, title and genres.
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [130]:
import re

def clean_title(title):
  """Strip every character that is not an ASCII letter, an ASCII digit or a space.

  Nothing is substituted for the removed characters, so the neighbouring
  characters end up adjacent (e.g. "Half-Blood" becomes "HalfBlood").
  """
  kept = (
      ch for ch in title
      if ch == " " or (ch.isascii() and ch.isalnum())
  )
  return "".join(kept)

In [131]:
# Store the output of clean_title for every title in a new "clean_title" column.
movies["clean_title"] = movies["title"].apply(clean_title)

In [132]:
# Re-display the table to confirm the clean_title column is present.
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [133]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over single terms and adjacent term pairs (ngram_range=(1,2)).
vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Fit on the cleaned titles and keep the resulting matrix for the search step.
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [134]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title):
  """Return the movies whose titles best match ``title``, best match first.

  The query is normalised with ``clean_title``, encoded with the fitted
  ``vectorizer`` and compared against every row of ``tfidf`` by cosine
  similarity.

  Parameters
  ----------
  title : str
      Free-text movie title to look up.

  Returns
  -------
  pandas.DataFrame
      Up to five rows of ``movies``, ordered from most to least similar.
  """
  title = clean_title(title)
  query_vec = vectorizer.transform([title])
  similarity = cosine_similarity(query_vec, tfidf).flatten()

  # Never request more rows than exist; argpartition raises on an
  # out-of-range kth.
  top_n = min(5, similarity.size)
  if top_n == 0:
    return movies.iloc[[]]

  # argpartition only guarantees WHICH entries are the largest, not their
  # order, so the selected indices are sorted by similarity explicitly.
  indices = np.argpartition(similarity, -top_n)[-top_n:]
  indices = indices[np.argsort(similarity[indices])[::-1]]
  return movies.iloc[indices]

In [135]:
# `results` is only a local inside search(), so it must be bound here before
# it can be displayed; without this the cell fails on a fresh kernel.
# NOTE(review): the "Harry Potter" query is inferred from the output rendered
# for this cell -- confirm it matches the original lookup.
results = search("Harry Potter")
results

Unnamed: 0,movieId,title,genres,clean_title
13512,69844,Harry Potter and the Half-Blood Prince (2009),Adventure|Fantasy|Mystery|Romance|IMAX,Harry Potter and the HalfBlood Prince 2009
4790,4896,Harry Potter and the Sorcerer's Stone (a.k.a. ...,Adventure|Children|Fantasy,Harry Potter and the Sorcerers Stone aka Harry...
5704,5816,Harry Potter and the Chamber of Secrets (2002),Adventure|Fantasy,Harry Potter and the Chamber of Secrets 2002
10408,40815,Harry Potter and the Goblet of Fire (2005),Adventure|Fantasy|Thriller|IMAX,Harry Potter and the Goblet of Fire 2005
11700,54001,Harry Potter and the Order of the Phoenix (2007),Adventure|Drama|Fantasy|IMAX,Harry Potter and the Order of the Phoenix 2007


In [162]:
import ipywidgets as widgets
from IPython.display import display

# Text box pre-filled with an example title, plus an output area for the matches.
movie_input = widgets.Text(value="Toy Story", description="Movie Title:", disabled=False)
movie_list = widgets.Output()


def on_type(data):
  """Refresh the match table from the text carried in ``data["new"]``.

  Registered below through ``movie_input.observe``; anything displayed is
  routed into ``movie_list``.
  """
  with movie_list:
    movie_list.clear_output()
    typed = data["new"]
    # Queries of five characters or fewer are not searched.
    if len(typed) <= 5:
      return
    display(search(typed))


# Invoke the callback for changes named "value" only.
movie_input.observe(on_type, names="value")

display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [137]:
# Load the ratings table from ratings.csv, resolved relative to the working directory.
ratings = pd.read_csv("ratings.csv")

In [138]:
# Preview the ratings; the rendered output lists userId, movieId, rating and timestamp.
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296.0,5.0,1.147880e+09
1,1,306.0,3.5,1.147869e+09
2,1,307.0,5.0,1.147869e+09
3,1,665.0,5.0,1.147879e+09
4,1,899.0,3.5,1.147869e+09
...,...,...,...,...
1063351,7158,1356.0,5.0,8.587772e+08
1063352,7158,1358.0,5.0,8.587777e+08
1063353,7158,1363.0,3.0,8.587775e+08
1063354,7158,1366.0,3.0,8.587776e+08


In [139]:
# Seed movie for the step-by-step walkthrough in the following cells
# (movieId 1 is "Toy Story (1995)" in the movies table displayed earlier).
movie_id = 1

In [140]:
# Distinct users who rated the seed movie above 4.
similar_users = (
    ratings.loc[
        ratings["movieId"].eq(movie_id) & ratings["rating"].gt(4),
        "userId",
    ]
    .unique()
)

In [141]:
# Inspect the user ids collected in the previous cell.
similar_users

array([  36,   75,   86,   90,   93,   95,   96,   98,  111,  120,  127,
        143,  152,  158,  160,  162,  171,  186,  188,  211,  217,  229,
        230,  235,  249,  257,  259,  297,  298,  302,  323,  329,  355,
        359,  369,  371,  381,  392,  402,  411,  428,  435,  439,  447,
        449,  468,  469,  477,  484,  513,  519,  537,  540,  541,  548,
        551,  553,  561,  567,  573,  582,  593,  607,  609,  611,  623,
        624,  626,  628,  631,  644,  653,  654,  670,  683,  686,  694,
        697,  702,  709,  727,  733,  741,  749,  752,  765,  768,  773,
        785,  791,  793,  796,  803,  805,  807,  811,  830,  834,  839,
        848,  856,  896,  904,  905,  911,  927,  947,  950,  956,  966,
        969,  986,  997, 1007, 1010, 1013, 1036, 1038, 1042, 1065, 1079,
       1092, 1096, 1101, 1118, 1123, 1131, 1138, 1140, 1141, 1143, 1146,
       1150, 1159, 1166, 1167, 1169, 1171, 1176, 1179, 1192, 1196, 1198,
       1199, 1200, 1228, 1230, 1232, 1240, 1242, 12

In [142]:
# movieId of every rating above 4 given by one of the similar users
# (one entry per rating, so popular movies repeat).
similar_user_recs = ratings.loc[
    ratings["userId"].isin(similar_users) & ratings["rating"].gt(4),
    "movieId",
]

In [143]:
# Inspect the collected movieIds (still one row per rating at this point).
similar_user_recs

5101           1.0
5105          34.0
5111         110.0
5114         150.0
5127         260.0
            ...   
1062990    78499.0
1062991    79132.0
1062993    80463.0
1062994    80489.0
1062996    81932.0
Name: movieId, Length: 55383, dtype: float64

In [144]:
# Turn the per-rating list into, for each movie, the fraction of similar users
# who rated it above 4.
# NOTE: this rebinds similar_user_recs to the normalised counts, so re-running
# this cell on its own would apply value_counts() to the fractions instead.
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

similar_user_recs

1.0        1.000000
318.0      0.432633
260.0      0.380717
356.0      0.341162
296.0      0.339926
             ...   
33288.0    0.001236
3854.0     0.001236
4928.0     0.001236
5938.0     0.001236
70862.0    0.001236
Name: movieId, Length: 6276, dtype: float64

In [145]:
# Keep only movies rated above 4 by more than 10% of the similar users.
similar_user_recs = similar_user_recs[similar_user_recs > .1]

similar_user_recs

1.0       1.000000
318.0     0.432633
260.0     0.380717
356.0     0.341162
296.0     0.339926
            ...   
2997.0    0.103832
5418.0    0.102596
1278.0    0.101360
1246.0    0.101360
778.0     0.100124
Name: movieId, Length: 102, dtype: float64

In [146]:
# Every rating above 4, by any user, for one of the candidate movies kept above.
all_users = ratings.loc[
    ratings["movieId"].isin(similar_user_recs.index) & ratings["rating"].gt(4)
]

In [147]:
# Inspect the ratings that feed the baseline ("all users") share.
all_users

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296.0,5.0,1.147880e+09
29,1,4973.0,4.5,1.147869e+09
48,1,7361.0,5.0,1.147880e+09
72,2,110.0,5.0,1.141417e+09
76,2,260.0,5.0,1.141417e+09
...,...,...,...,...
1063012,7156,33794.0,5.0,1.514739e+09
1063016,7156,58559.0,4.5,1.514736e+09
1063050,7157,296.0,4.5,1.443945e+09
1063311,7158,608.0,5.0,8.587772e+08


In [148]:
# For each candidate movie: the fraction of users in all_users (those who rated
# at least one candidate above 4) who rated that movie above 4.
all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

In [149]:
# Inspect the baseline share per candidate movie.
all_users_recs

318.0      0.348187
296.0      0.282931
2571.0     0.242447
356.0      0.235196
593.0      0.224773
             ...   
1580.0     0.043807
1278.0     0.039275
50872.0    0.038671
78499.0    0.036556
2355.0     0.024471
Name: movieId, Length: 102, dtype: float64

In [150]:
# Place the two shares side by side; both Series are indexed by movieId, so
# concat along axis=1 lines them up per movie.
rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
rec_percentages.columns = ["similar", "all"]

In [151]:
# Inspect the combined "similar" and "all" shares.
rec_percentages

Unnamed: 0,similar,all
1.0,1.000000,0.122205
318.0,0.432633,0.348187
260.0,0.380717,0.211178
356.0,0.341162,0.235196
296.0,0.339926,0.282931
...,...,...
2997.0,0.103832,0.076586
5418.0,0.102596,0.061027
1278.0,0.101360,0.039275
1246.0,0.101360,0.058912


In [152]:
# Ratio of the two shares: values above 1 mean the movie is liked more often by
# the similar users than by the baseline group.
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

In [153]:
# Rank the candidates with the highest score first.
rec_percentages = rec_percentages.sort_values("score", ascending=False)

In [154]:
# Inspect the ranked candidates.
rec_percentages

Unnamed: 0,similar,all,score
1.0,1.000000,0.122205,8.182942
3114.0,0.265760,0.050755,5.236109
2355.0,0.112485,0.024471,4.596591
78499.0,0.154512,0.036556,4.226726
588.0,0.223733,0.067825,3.298692
...,...,...,...
296.0,0.339926,0.282931,1.201446
2858.0,0.194067,0.165106,1.175409
2329.0,0.111248,0.094713,1.174585
7361.0,0.113721,0.103021,1.103857


In [155]:
# Attach title and genres to the ten highest-scoring candidates by matching the
# movieId index against movies["movieId"].
rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,1.0,0.122205,8.182942,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.26576,0.050755,5.236109,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
2264,0.112485,0.024471,4.596591,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
14813,0.154512,0.036556,4.226726,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
580,0.223733,0.067825,3.298692,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
4780,0.212608,0.067221,3.162845,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
587,0.18665,0.063595,2.934974,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
6258,0.192831,0.067221,2.868627,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,Finding Nemo 2003
1047,0.139679,0.049849,2.802038,1073,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical,Willy Wonka the Chocolate Factory 1971
359,0.245983,0.088671,2.774115,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,Lion King The 1994


In [189]:
def find_similar_movies(movie_id):
  """Recommend up to ten movies favoured by the users who liked ``movie_id``.

  A rating above 4 counts as "liking" a movie. The steps are:
    1. Collect the users who liked ``movie_id``.
    2. For every movie, compute the share of those users who liked it and
       keep the movies liked by more than 10% of them.
    3. For the kept movies, compute the share of likes among all users who
       liked at least one of them.
    4. Score each movie as similar-share / overall-share and keep the ten
       highest scores.

  Parameters
  ----------
  movie_id
      Value compared against ``ratings["movieId"]``.

  Returns
  -------
  pandas.DataFrame
      Columns ``score``, ``title`` and ``genres``; the genre separator
      ``|`` is replaced by ``", "``.
  """
  # Every step below only looks at ratings above 4, so filter once and reuse.
  liked = ratings[ratings["rating"] > 4]

  similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
  similar_user_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]

  similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
  similar_user_recs = similar_user_recs[similar_user_recs > .1]

  all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
  all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

  rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
  rec_percentages.columns = ["similar", "all"]
  rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
  rec_percentages = rec_percentages.sort_values("score", ascending=False)

  top = rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]

  # "|" must be matched literally: as a regex it is an alternation of two empty
  # patterns, which matches at every position and would insert ", " between
  # every character instead of replacing the separator.
  return top.assign(genres=top["genres"].str.replace("|", ", ", regex=False))

In [190]:
# Text box for the seed title, plus an output area for the recommendations.
movie_name_input = widgets.Text(value="Toy Story", description="Movie Title:", disabled=False)
recommendation_list = widgets.Output()


# NOTE: this rebinds the name `on_type` that the earlier title-search cell also defines.
def on_type(data):
  """Show recommendations for the best title match of ``data["new"]``.

  Registered below through ``movie_name_input.observe``; anything displayed
  is routed into ``recommendation_list``.
  """
  with recommendation_list:
    recommendation_list.clear_output()
    typed = data["new"]
    # Queries of five characters or fewer are not searched.
    if len(typed) <= 5:
      return
    # The first row returned by search() is used as the seed movie.
    seed_id = search(typed).iloc[0]["movieId"]
    display(find_similar_movies(seed_id))


# Invoke the callback for changes named "value" only.
movie_name_input.observe(on_type, names="value")

display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()