In [38]:
import pandas as pd

In [39]:
# Load the movie catalogue from the working directory; the rendered frame
# below shows movieId, title and genres columns.
movies = pd.read_csv('movies.csv')
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [40]:
import re

def no_characs(title):
    """Return ``title`` with every character except ASCII letters, digits and spaces removed.

    Args:
        title: The raw title string, e.g. ``"Toy Story (1995)"``.

    Returns:
        The cleaned string, e.g. ``"Toy Story 1995"``.
    """
    # The upper-case range must end at "Z": the range "A-z" also spans the
    # ASCII characters [ \ ] ^ _ ` that sit between "Z" and "a", so they
    # would survive the clean-up.
    return re.sub(r"[^a-zA-Z0-9 ]", "", title)



In [41]:
# Store a punctuation-free copy of every title; this is the text the
# search index is built from.
movies["filter_title"] = movies["title"].map(no_characs)
movies

Unnamed: 0,movieId,title,genres,filter_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer

# ngram_range=(1,2): index single words and adjacent word pairs of each title.
vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Learn the vocabulary from the cleaned titles and build one TF-IDF row per movie.
tfidf = vectorizer.fit_transform(movies["filter_title"])

In [43]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np 

def search(title, top_n=5):
    """Return the rows of ``movies`` whose titles best match ``title``.

    The query is cleaned with ``no_characs``, vectorised with the fitted
    ``vectorizer`` and compared against ``tfidf`` by cosine similarity.

    Args:
        title: Free-text title typed by the user.
        top_n: Maximum number of matches to return (default 5).

    Returns:
        A slice of ``movies`` ordered from most to least similar, so that
        ``.iloc[0]`` is the best match.
    """
    title = no_characs(title)
    query_vec = vectorizer.transform([title])
    similar = cosine_similarity(query_vec, tfidf).flatten()

    # argpartition raises if asked for more elements than exist.
    top_n = min(top_n, similar.shape[0])
    if top_n <= 0:
        return movies.iloc[[]]

    # argpartition only guarantees that the last ``top_n`` positions hold the
    # largest scores; it does not order them. Rank the candidates explicitly
    # so the best match really comes first.
    candidates = np.argpartition(similar, -top_n)[-top_n:]
    ranked = candidates[np.argsort(similar[candidates])[::-1]]
    return movies.iloc[ranked]



In [44]:
import ipywidgets as widgets
from IPython.display import display

# Text box in which the user types a movie title.
movie_input = widgets.Text(description="Enter Movie Title:", disabled=False)

# Area that is redrawn with the matching titles after every change.
movie_output = widgets.Output()


def on_type(data):
    """Redraw ``movie_output`` with search results for the new text.

    ``data`` is the change dict handed over by ``observe``; its ``'new'``
    entry holds the current text. Nothing is shown until the text is longer
    than five characters.
    """
    with movie_output:
        movie_output.clear_output()
        typed = data["new"]
        if len(typed) <= 5:
            return
        display(search(typed))


# Fire the callback whenever the text value changes, then render both widgets.
movie_input.observe(on_type, names="value")
display(movie_input, movie_output)

Text(value='', description='Enter Movie Title:')

Output()

In [45]:
# Load the user ratings; the rendered frame below shows userId, movieId,
# rating and timestamp columns.
ratings = pd.read_csv("ratings.csv")
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [193]:
# Example movie for the step-by-step walkthrough; the title lookup further
# down shows 3114 as "Toy Story 2 (1999)".
movie_id = 3114

In [194]:
# Users who rated the chosen movie above 4: the "similar" audience.
similar_users = ratings.loc[
    (ratings["movieId"] == movie_id) & (ratings["rating"] > 4),
    "userId",
].unique()

In [195]:
# Inspect the array of user ids selected above.
similar_users

array([     2,     86,    160, ..., 162508, 162519, 162530], dtype=int64)

In [196]:
# Every movie that the similar users themselves rated above 4.
similar_users_likes = ratings.loc[
    ratings["userId"].isin(similar_users) & (ratings["rating"] > 4),
    "movieId",
]

In [197]:
# Turn the liked-movie ids into the share of similar users who liked each movie.
# NOTE(review): this rebinds ``similar_users_likes`` from itself, so re-running
# this cell alone would apply value_counts() to the already-normalised shares;
# re-run the previous cell first.
similar_users_likes = similar_users_likes.value_counts() / len(similar_users)
similar_users_likes

3114      1.000000
1         0.651788
318       0.451418
260       0.440567
1196      0.431443
            ...   
2466      0.000123
170789    0.000123
165665    0.000123
161814    0.000123
92841     0.000123
Name: movieId, Length: 16829, dtype: float64

In [198]:
# Keep only movies liked by more than 10% of the similar users.
similar_users_likes = similar_users_likes[similar_users_likes > 0.1]
# The index holds integer movie ids, so a bare ``[0:6]`` slice is ambiguous
# (position vs. label) and pandas 1.5.x warns about it; ``.iloc`` states
# explicitly that the first six rows by position are wanted.
similar_users_likes.iloc[0:6].index.tolist()

  similar_users_likes[0:6].index.tolist()


[3114, 1, 318, 260, 1196, 2571]

In [199]:
# Look up the titles of the six most-liked movies. ``.iloc`` makes the slice
# explicitly positional, because the Series index holds integer movie ids.
movies[movies["movieId"].isin(similar_users_likes.iloc[0:6].index)]

  movies[(movies["movieId"].isin(similar_users_likes[0:6].index.tolist()))]


Unnamed: 0,movieId,title,genres,filter_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
257,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,Star Wars Episode IV A New Hope 1977
314,318,"Shawshank Redemption, The (1994)",Crime|Drama,Shawshank Redemption The 1994
1166,1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi,Star Wars Episode V The Empire Strikes Back 1980
2480,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,Matrix The 1999
3021,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999


In [200]:
# All ratings above 4, by any user, for the candidate movies kept above.
global_users = ratings.loc[
    ratings["movieId"].isin(similar_users_likes.index) & (ratings["rating"] > 4)
]
global_users

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
19,1,2692,5.0,1147869100
29,1,4973,4.5,1147869080
48,1,7361,5.0,1147880055
72,2,110,5.0,1141416589
...,...,...,...,...
25000062,162541,5618,4.5,1240953299
25000065,162541,5952,5.0,1240952617
25000078,162541,7153,5.0,1240952613
25000081,162541,7361,4.5,1240953484


In [201]:
# Share of all users in ``global_users`` who liked each candidate movie.
global_users_freq = (
    global_users["movieId"].value_counts()
    / len(global_users["userId"].unique())
)
global_users_freq

318     0.338189
296     0.281320
2571    0.241159
356     0.232494
593     0.223247
          ...   
2081    0.021354
3751    0.021170
2761    0.020680
1907    0.019796
3175    0.019417
Name: movieId, Length: 174, dtype: float64

In [202]:
# Put the two shares side by side, aligned on movie id.
rec_compare = pd.concat(
    [similar_users_likes, global_users_freq],
    axis=1,
).set_axis(["similar", "global"], axis=1)
rec_compare

Unnamed: 0,similar,global
3114,1.000000,0.053073
1,0.651788,0.123259
318,0.451418,0.338189
260,0.440567,0.219589
1196,0.431443,0.185815
...,...,...
1079,0.101850,0.036418
91529,0.101480,0.054245
903,0.101480,0.044913
48394,0.100370,0.054441


In [203]:
# Score = how much more often the similar users like a movie than users
# overall do; the highest-scoring movies come first.
rec_compare["score"] = rec_compare["similar"] / rec_compare["global"]
rec_compare = rec_compare.sort_values(by="score", ascending=False)
rec_compare_list = rec_compare.index.tolist()
rec_compare

Unnamed: 0,similar,global,score
3114,1.000000,0.053073,18.841924
2355,0.203576,0.024796,8.210086
2761,0.142047,0.020680,6.868954
78499,0.225771,0.034717,6.503216
3751,0.132799,0.021170,6.272875
...,...,...,...
2959,0.308015,0.214164,1.438218
593,0.314673,0.223247,1.409527
79132,0.176326,0.129836,1.358062
318,0.451418,0.338189,1.334809


In [204]:
# Attach titles and genres to the six best-scoring movie ids, keeping score order.
(
    movies.loc[movies["movieId"].isin(rec_compare_list)]
    .set_index("movieId")
    .reindex(rec_compare_list)
    .reset_index()
    .iloc[0:6][["movieId", "title", "genres"]]
)

Unnamed: 0,movieId,title,genres
0,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy
1,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy
2,2761,"Iron Giant, The (1999)",Adventure|Animation|Children|Drama|Sci-Fi
3,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX
4,3751,Chicken Run (2000),Animation|Children|Comedy
5,2081,"Little Mermaid, The (1989)",Animation|Children|Comedy|Musical|Romance


In [205]:
# Show the six best-scoring rows with their similar/global shares.
rec_compare.head(6)

Unnamed: 0,similar,global,score
3114,1.0,0.053073,18.841924
2355,0.203576,0.024796,8.210086
2761,0.142047,0.02068,6.868954
78499,0.225771,0.034717,6.503216
3751,0.132799,0.02117,6.272875
2081,0.118126,0.021354,5.531892


In [206]:
def recom_movie(movie_id, top_n=10):
    """Recommend movies favoured by users who rated ``movie_id`` above 4.

    Reads the module-level ``ratings`` and ``movies`` frames. A candidate's
    score is the share of "similar" users (those who rated ``movie_id`` above
    4) who also rated the candidate above 4, divided by the share of all
    users in the candidate pool who did so.

    Args:
        movie_id: Id of the movie to base the recommendations on.
        top_n: Maximum number of recommendations to return (default 10).

    Returns:
        A DataFrame with ``movieId``, ``title`` and ``genres`` columns,
        ordered by descending score and indexed by rank starting at 1.
        It is empty when nobody rated ``movie_id`` above 4.
    """
    columns = ["movieId", "title", "genres"]

    # The "liked it" condition is reused by every filter below; compute it once.
    liked = ratings["rating"] > 4

    similar_users = ratings.loc[liked & (ratings["movieId"] == movie_id), "userId"].unique()
    if len(similar_users) == 0:
        # No audience to learn from; also avoids dividing by zero below.
        return movies.iloc[0:0][columns]

    similar_users_likes = ratings.loc[liked & ratings["userId"].isin(similar_users), "movieId"]
    similar_users_likes = similar_users_likes.value_counts() / len(similar_users)
    # Keep only movies liked by more than 10% of the similar users.
    similar_users_likes = similar_users_likes[similar_users_likes > 0.1]

    global_users = ratings.loc[liked & ratings["movieId"].isin(similar_users_likes.index)]
    global_users_freq = global_users["movieId"].value_counts() / len(global_users["userId"].unique())

    rec_compare = pd.concat([similar_users_likes, global_users_freq], axis=1)
    rec_compare.columns = ["similar", "global"]
    rec_compare["score"] = rec_compare["similar"] / rec_compare["global"]

    # Remove the query movie by id rather than assuming it sorts first: its
    # "similar" share is 1.0, but a widely liked query movie can still be
    # out-scored by a niche candidate with a small "global" share.
    rec_compare = rec_compare.drop(index=movie_id, errors="ignore")
    rec_compare = rec_compare.sort_values("score", ascending=False)
    rec_compare_list = rec_compare.index[:top_n].tolist()

    recommendations = (
        movies[movies["movieId"].isin(rec_compare_list)]
        .set_index("movieId")
        .reindex(rec_compare_list)
        .reset_index()[columns]
    )
    # Label rows by rank starting at 1, matching the labels the former
    # ``[1:11]`` slice produced.
    recommendations.index = range(1, len(recommendations) + 1)
    return recommendations
    

In [207]:
# NOTE: this cell rebinds ``movie_input`` and ``on_type`` from the earlier
# search cell; the earlier widget keeps the callback it was registered with.
movie_input = widgets.Text(description="Enter Movie Title:", disabled=False)

# Area that is redrawn with the recommendations after every change.
rec_list = widgets.Output()


def on_type(data):
    """Redraw ``rec_list`` with recommendations for the best title match.

    ``data`` is the change dict handed over by ``observe``; its ``'new'``
    entry holds the current text. Nothing is shown until the text is longer
    than five characters. The recommendations are displayed first, followed
    by the title of the movie that was matched.
    """
    with rec_list:
        rec_list.clear_output()
        typed = data["new"]
        if len(typed) <= 5:
            return
        best_match = search(typed).iloc[0]
        display(recom_movie(best_match["movieId"]))
        display(best_match["title"])


# Fire the callback whenever the text value changes, then render both widgets.
movie_input.observe(on_type, names="value")
display(movie_input, rec_list)

Text(value='', description='Enter Movie Title:')

Output()