In [1]:
import pandas as pd
movies_df = pd.read_csv('../data/raw/ml-latest-small/movies.csv')
ratings_df = pd.read_csv('../data/raw/ml-latest-small/ratings.csv')
tags_df = pd.read_csv('../data/raw/ml-latest-small/tags.csv')
links_df = pd.read_csv('../data/raw/ml-latest-small/links.csv')

In [2]:
print(movies_df.head())
print(movies_df.describe())
print(ratings_df.info())
print(links_df.info())

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
             movieId
count    9742.000000
mean    42200.353623
std     52160.494854
min         1.000000
25%      3248.250000
50%      7300.000000
75%     76232.000000
max    193609.000000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 n

In [3]:
unique_movieId_list = movies_df['genres'].unique()
print(unique_movieId_list)

['Adventure|Animation|Children|Comedy|Fantasy'
 'Adventure|Children|Fantasy' 'Comedy|Romance' 'Comedy|Drama|Romance'
 'Comedy' 'Action|Crime|Thriller' 'Adventure|Children' 'Action'
 'Action|Adventure|Thriller' 'Comedy|Horror'
 'Adventure|Animation|Children' 'Drama' 'Action|Adventure|Romance'
 'Crime|Drama' 'Drama|Romance' 'Action|Comedy|Crime|Drama|Thriller'
 'Comedy|Crime|Thriller' 'Crime|Drama|Horror|Mystery|Thriller'
 'Drama|Sci-Fi' 'Children|Drama' 'Adventure|Drama|Fantasy|Mystery|Sci-Fi'
 'Mystery|Sci-Fi|Thriller' 'Children|Comedy' 'Drama|War'
 'Action|Crime|Drama' 'Action|Adventure|Fantasy' 'Comedy|Drama|Thriller'
 'Mystery|Thriller' 'Animation|Children|Drama|Musical|Romance'
 'Crime|Mystery|Thriller' 'Adventure|Drama' 'Drama|Thriller'
 'Comedy|Crime' 'Action|Sci-Fi|Thriller' 'Action|Comedy|Horror|Thriller'
 'Comedy|Drama' 'Documentary' 'Action|Crime|Drama|Thriller'
 'Crime|Drama|Romance' 'Action|Adventure|Drama' 'Action|Thriller'
 'Drama|Horror|Thriller' 'Comedy|Horror|Romance'


In [4]:
user_movie_matrix = ratings_df.pivot(index='userId',columns='movieId',values='rating')
print(user_movie_matrix.head())

movieId  1       2       3       4       5       6       7       8       \
userId                                                                    
1           4.0     NaN     4.0     NaN     NaN     4.0     NaN     NaN   
2           NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
3           NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
4           NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
5           4.0     NaN     NaN     NaN     NaN     NaN     NaN     NaN   

movieId  9       10      ...  193565  193567  193571  193573  193579  193581  \
userId                   ...                                                   
1           NaN     NaN  ...     NaN     NaN     NaN     NaN     NaN     NaN   
2           NaN     NaN  ...     NaN     NaN     NaN     NaN     NaN     NaN   
3           NaN     NaN  ...     NaN     NaN     NaN     NaN     NaN     NaN   
4           NaN     NaN  ...     NaN     NaN     NaN     NaN     NaN     N

In [5]:
user_movie_matrix_filled = user_movie_matrix.fillna(0)
print(user_movie_matrix_filled)

movieId  1       2       3       4       5       6       7       8       \
userId                                                                    
1           4.0     0.0     4.0     0.0     0.0     4.0     0.0     0.0   
2           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
3           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
4           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
5           4.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
...         ...     ...     ...     ...     ...     ...     ...     ...   
606         2.5     0.0     0.0     0.0     0.0     0.0     2.5     0.0   
607         4.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
608         2.5     2.0     2.0     0.0     0.0     0.0     0.0     0.0   
609         3.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
610         5.0     0.0     0.0     0.0     0.0     5.0     0.0     0.0   

movieId  9       10     

In [6]:
import re
def clean_title(title):
    return re.sub("[^a-zA-Z0-9 ]","",title)

In [7]:
movies_df["clean_title"] = movies_df["title"].apply(clean_title)
movies_df

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,Black Butler Book of the Atlantic 2017
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,No Game No Life Zero 2017
9739,193585,Flint (2017),Drama,Flint 2017
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,Bungo Stray Dogs Dead Apple 2018


In [8]:
# https://www.youtube.com/watch?v=eyEabQRBMQA
# Term Frequency Matrix (TF), Inverse Document Frequency (IDF)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
# ngram = groups distinct movie title words into one single entity.
vectorize = TfidfVectorizer(ngram_range=(1,2))
tfidf = vectorize.fit_transform(movies_df["clean_title"])

In [10]:
# creating search function
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


title="Men 1995"
title = clean_title(title)
search_query = vectorize.transform([title])
sim_data = cosine_similarity(search_query,tfidf).flatten()
indices = np.argpartition(sim_data,-5)[-5:]
result = movies_df.iloc[indices]

In [11]:
search_query

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 3 stored elements and shape (1, 33421)>

In [12]:
sim_data

array([0.10392147, 0.12487773, 0.57518275, ..., 0.        , 0.        ,
       0.        ])

In [13]:
result

Unnamed: 0,movieId,title,genres,clean_title
9014,140289,Men & Chicken (2015),Comedy|Drama,Men Chicken 2015
1075,1395,Tin Men (1987),Comedy|Drama,Tin Men 1987
1126,1473,Best Men (1997),Action|Comedy|Crime|Drama,Best Men 1997
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
2044,2723,Mystery Men (1999),Action|Comedy|Fantasy,Mystery Men 1999


In [14]:
# creating search function - part2
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def movie_search(title):
    title = clean_title(title)
    search_query = vectorize.transform([title])
    sim_data = cosine_similarity(search_query,tfidf).flatten()
    indices = np.argpartition(sim_data,-5)[-5:]
    result = movies_df.iloc[indices][::-1]
    return result

In [15]:
result

Unnamed: 0,movieId,title,genres,clean_title
9014,140289,Men & Chicken (2015),Comedy|Drama,Men Chicken 2015
1075,1395,Tin Men (1987),Comedy|Drama,Tin Men 1987
1126,1473,Best Men (1997),Action|Comedy|Crime|Drama,Best Men 1997
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
2044,2723,Mystery Men (1999),Action|Comedy|Fantasy,Mystery Men 1999


In [None]:
import ipywidgets as widgets
from IPython.display import display
movie_input = widgets.Text(
    value="Toy Story",
    description="Movie Title",
    disabled=False
)
movies_output=widgets.Output()
def data_type(data):
    with movies_output:
        movies_output.clear_output()
        # display(data)
        title=data["new"]
        if len(title) > 5:
            display(movie_search(title))

movie_input.observe(data_type,names='value')
display(movie_input,movies_output)

Text(value='Toy Story', description='Movie Title')

Output()

In [17]:
# Reading Movie Rating Data
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [18]:
ratings_df.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [19]:
find_movieId = 1

In [20]:
# Finding users who liked same movie
sim_users = ratings_df[(ratings_df["movieId"] == find_movieId) & (ratings_df["rating"]>4)]["userId"].unique()

In [21]:
sim_users

array([  7,  17,  31,  40,  43,  46,  57,  63,  71,  73,  96,  98, 145,
       151, 159, 166, 169, 171, 177, 201, 206, 220, 229, 234, 240, 247,
       252, 254, 269, 270, 273, 275, 280, 282, 288, 304, 328, 341, 347,
       353, 357, 364, 367, 378, 380, 382, 389, 396, 411, 438, 448, 451,
       453, 456, 460, 471, 484, 488, 533, 559, 562, 573, 584, 587, 610],
      dtype=int64)

In [22]:
sim_user_recm_movies = ratings_df[(ratings_df["userId"].isin(sim_users)) & (ratings_df["rating"]>4)]["movieId"]

In [23]:
sim_user_recm_movies

874            1
875           50
877          150
879          260
880          356
           ...  
100821    160527
100829    164179
100832    168248
100833    168250
100834    168252
Name: movieId, Length: 3754, dtype: int64

In [24]:
sim_user_recm_movies=sim_user_recm_movies.value_counts()/len(sim_users)
sim_user_recm_movies =sim_user_recm_movies[sim_user_recm_movies>.1]

In [25]:
sim_user_recm_movies

1        1.000000
318      0.430769
296      0.400000
356      0.384615
593      0.369231
           ...   
8368     0.107692
1097     0.107692
74458    0.107692
1219     0.107692
733      0.107692
Name: movieId, Length: 103, dtype: float64

In [26]:
# Find how much all users like movies

all_db_users=ratings_df[(ratings_df["movieId"].isin(sim_user_recm_movies.index)) & (ratings_df["rating"]>4)]

In [27]:
all_db_users

Unnamed: 0,userId,movieId,rating,timestamp
3,1,47,5.0,964983815
4,1,50,5.0,964982931
15,1,260,5.0,964981680
25,1,457,5.0,964981909
28,1,527,5.0,964984002
...,...,...,...,...
100227,610,51255,5.0,1479542571
100310,610,58559,4.5,1493844688
100326,610,60069,4.5,1493844866
100429,610,74458,4.5,1479542157


In [29]:
all_db_users_recm_movies = all_db_users["movieId"].value_counts()/len(all_db_users["userId"].unique())

In [30]:
all_db_users_recm_movies

318      0.362007
296      0.299283
356      0.277778
2571     0.268817
2959     0.232975
           ...   
8636     0.044803
899      0.039427
733      0.037634
78499    0.037634
500      0.030466
Name: movieId, Length: 103, dtype: float64

In [31]:
#Creating a Recommendation Score

movie_recm_percnt = pd.concat([sim_user_recm_movies,all_db_users_recm_movies],axis=1)
movie_recm_percnt.columns=["simUsers","allUsers"]

In [32]:
movie_recm_percnt

Unnamed: 0,simUsers,allUsers
1,1.000000,0.116487
318,0.430769,0.362007
296,0.400000,0.299283
356,0.384615,0.277778
593,0.369231,0.229391
...,...,...
8368,0.107692,0.055556
1097,0.107692,0.069892
74458,0.107692,0.060932
1219,0.107692,0.062724


In [33]:
movie_recm_percnt["score"] = movie_recm_percnt["simUsers"]/movie_recm_percnt["allUsers"]

In [34]:
movie_recm_percnt = movie_recm_percnt.sort_values("score",ascending=False) # highest values are first.

In [35]:
movie_recm_percnt

Unnamed: 0,simUsers,allUsers,score
1,1.000000,0.116487,8.584615
3114,0.307692,0.059140,5.202797
78499,0.184615,0.037634,4.905495
500,0.138462,0.030466,4.544796
8961,0.184615,0.057348,3.219231
...,...,...,...
110,0.184615,0.181004,1.019954
4973,0.107692,0.105735,1.018514
79132,0.123077,0.123656,0.995318
2571,0.261538,0.268817,0.972923


In [36]:
movie_recm_percnt.head(10).merge(movies_df,left_index=True,right_on="movieId")

Unnamed: 0,simUsers,allUsers,score,movieId,title,genres,clean_title
0,1.0,0.116487,8.584615,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
2355,0.307692,0.05914,5.202797,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
7355,0.184615,0.037634,4.905495,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
436,0.138462,0.030466,4.544796,500,Mrs. Doubtfire (1993),Comedy|Drama,Mrs Doubtfire 1993
5374,0.184615,0.057348,3.219231,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy,Incredibles The 2004
32,0.169231,0.053763,3.147692,34,Babe (1995),Children|Drama,Babe 1995
2038,0.169231,0.053763,3.147692,2716,Ghostbusters (a.k.a. Ghost Busters) (1984),Action|Comedy|Sci-Fi,Ghostbusters aka Ghost Busters 1984
506,0.246154,0.082437,2.985953,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
592,0.107692,0.037634,2.861538,733,"Rock, The (1996)",Action|Adventure|Thriller,Rock The 1996
5260,0.123077,0.044803,2.747077,8636,Spider-Man 2 (2004),Action|Adventure|Sci-Fi|IMAX,SpiderMan 2 2004


In [37]:
# 32:09