In [1]:
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import seaborn as sns

from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings("ignore")

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/movielens-dataset/movies.csv

/kaggle/input/movielens-dataset/ratings.csv


- **In movies.csv:**
    - movieId: The ID of the movie.	
    - title: The title of the movie
    - genres: Movie genres.

- **In ratings.csv:**
    - userId: The ID of the use.
    - movieId: The ID of the movie.
    - rating: The rating the user gave the movie.	
    - timestamp: The time the movie was rated.

In [2]:
movies_df = pd.read_csv('/kaggle/input/movielens-dataset/movies.csv')
print(f"Shape of movies dataset: {movies_df.shape}")
movies_df.head()

Shape of movies dataset: (10329, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
print(f"Null values in movies dataset:\n{movies_df.isnull().sum()}")
print("-" * 15)
movies_df.info() 

Null values in movies dataset:

movieId    0

title      0

genres     0

dtype: int64

---------------

<class 'pandas.core.frame.DataFrame'>

RangeIndex: 10329 entries, 0 to 10328

Data columns (total 3 columns):

 #   Column   Non-Null Count  Dtype 

---  ------   --------------  ----- 

 0   movieId  10329 non-null  int64 

 1   title    10329 non-null  object

 2   genres   10329 non-null  object

dtypes: int64(1), object(2)

memory usage: 242.2+ KB


In [4]:
ratings_df = pd.read_csv('/kaggle/input/movielens-dataset/ratings.csv')
print(f"Shape of ratings dataset: {ratings_df.shape}")
ratings_df.head()

Shape of ratings dataset: (105339, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523


In [5]:
print(f"Null values in movies dataset:\n{ratings_df.isnull().sum()}")
print("-" * 15)
ratings_df.info() 

Null values in movies dataset:

userId       0

movieId      0

rating       0

timestamp    0

dtype: int64

---------------

<class 'pandas.core.frame.DataFrame'>

RangeIndex: 105339 entries, 0 to 105338

Data columns (total 4 columns):

 #   Column     Non-Null Count   Dtype  

---  ------     --------------   -----  

 0   userId     105339 non-null  int64  

 1   movieId    105339 non-null  int64  

 2   rating     105339 non-null  float64

 3   timestamp  105339 non-null  int64  

dtypes: float64(1), int64(3)

memory usage: 3.2 MB


In [6]:
movies_with_year = movies_df.copy()

movies_with_year['year'] = movies_df['title'].str.extract("(\(\d\d\d\d\))", expand = True)
movies_with_year['year'] = movies_with_year['year'].str.extract("(\d\d\d\d)", expand = True)
movies_with_year['year'] = pd.to_numeric(movies_with_year['year'], downcast = 'float' )

movies_with_year['title'] = movies_with_year['title'].str.replace("(\(\d\d\d\d\))", "")
movies_with_year['title'] = movies_with_year['title'].str.strip()

movies_with_year['genres'] = movies_with_year['genres'].str.split("|")

movies_with_year.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995.0
4,5,Father of the Bride Part II,[Comedy],1995.0


In [7]:
movies_with_year_genres = movies_with_year.copy()
for index, row in movies_with_year_genres.iterrows():
    for genres in row['genres']:
        movies_with_year_genres.at[index, genres] = 1.0   
        
movies_with_year_genres.fillna(0.0, inplace = True)        
movies_with_year_genres.head()      

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,War,Musical,Documentary,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],1995.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
genres_table = movies_with_year_genres.drop(columns = ['title', 'genres', 'year'])
genres_table.set_index('movieId', inplace = True)
genres_list = genres_table.columns
genres_table.head()

Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,War,Musical,Documentary,Western,Film-Noir,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
genres_weighted_with_ratings = ratings_df.merge(genres_table, on = 'movieId', copy = False)
genres_weighted_with_ratings.drop(columns = 'timestamp', inplace = True)

genres_weighted_with_ratings[list(genres_list)] = genres_weighted_with_ratings[list(genres_list)].multiply(genres_weighted_with_ratings['rating'],
                                                                                                           axis = 0)
genres_weighted_with_ratings.head()

Unnamed: 0,userId,movieId,rating,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,War,Musical,Documentary,Western,Film-Noir,(no genres listed)
0,1,16,4.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,9,16,4.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,12,16,1.5,0.0,0.0,0.0,0.0,0.0,0.0,1.5,0.0,1.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,24,16,4.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,29,16,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
users_profile = genres_weighted_with_ratings.groupby(by = ['userId']).mean()
users_profile.drop(columns = 'movieId', inplace = True)
print(f"Number of users(profile): {users_profile.shape[0]}")
users_profile.head()

Number of users(profile): 668


Unnamed: 0_level_0,rating,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,War,Musical,Documentary,Western,Film-Noir,(no genres listed)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.628319,1.013274,0.070796,0.084071,0.89823,0.269912,0.473451,1.530973,1.557522,1.154867,1.473451,0.261062,0.438053,0.752212,0.137168,0.371681,0.035398,0.004425,0.030973,0.070796,0.0
2,3.896552,1.37931,0.310345,0.448276,1.344828,0.586207,1.034483,1.655172,1.206897,0.448276,1.62069,0.0,0.241379,0.655172,0.0,0.0,0.275862,0.0,0.0,0.0,0.0
3,3.794521,0.424658,0.09589,0.260274,1.794521,0.232877,1.09589,1.958904,0.575342,0.657534,1.068493,0.109589,0.232877,0.136986,0.109589,0.150685,0.150685,0.068493,0.164384,0.0,0.0
4,4.16129,0.580645,0.153226,0.209677,1.491935,0.274194,1.241935,2.66129,0.427419,0.572581,0.540323,0.169355,0.354839,0.104839,0.0,0.58871,0.209677,0.0,0.169355,0.209677,0.0
5,3.183824,1.147059,1.264706,1.205882,2.279412,0.904412,1.154412,0.75,0.691176,0.227941,0.286765,0.176471,0.117647,0.323529,0.566176,0.014706,0.661765,0.0,0.0,0.0,0.0


## Content-based Filtering <a id = "4"></a>


In [11]:
def movie_rating(movies, ratings):
    input_movies = pd.DataFrame({
        'title': movies,
        'rating': ratings
        })
    
    return input_movies

def input_movie_rating(movies, ratings):
    movie_rating_df = movie_rating(movies, ratings)
    
    idmovie_rating_df = movies_with_year_genres[movies_with_year_genres['title'].isin(movies)]
    user_movie_genres = movie_rating_df.merge(idmovie_rating_df, on = 'title')
    user_movie_genres.drop(columns = ['genres', 'year'], inplace = True)
    user_movie_genres.set_index('movieId', inplace = True)
    
    return user_movie_genres

In [12]:
my_rating = input_movie_rating(['Rio Bravo', 'Vertigo', 'Modern Times', 
                                'To Be or Not to Be', 'Some Like It Hot'], 
                               [5.0, 5.0, 5.0, 5.0, 5.0])
my_rating

Unnamed: 0_level_0,title,rating,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,War,Musical,Documentary,Western,Film-Noir,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
4329,Rio Bravo,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
903,Vertigo,5.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3462,Modern Times,5.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
946,To Be or Not to Be,5.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
26491,To Be or Not to Be,5.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
910,Some Like It Hot,5.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
my_rating_by_genres = my_rating.copy()
my_rating_by_genres.drop(columns = ['title', 'rating'], inplace = True)
my_rating_by_genres = my_rating_by_genres.multiply(my_rating['rating'], axis = 'index')
my_profile = my_rating_by_genres.mean()
my_profile

Adventure             0.000000
Animation             0.000000
Children              0.000000
Comedy                3.333333
Fantasy               0.000000
Romance               2.500000
Drama                 2.500000
Action                0.000000
Crime                 0.833333
Thriller              0.833333
Horror                0.000000
Mystery               0.833333
Sci-Fi                0.000000
IMAX                  0.000000
War                   1.666667
Musical               0.000000
Documentary           0.000000
Western               0.833333
Film-Noir             0.000000
(no genres listed)    0.000000
dtype: float64

In [14]:
id_movies_recom = genres_table.multiply(my_profile, axis = 1).sum(axis = 1).sort_values(ascending = False)
id_movies_recom.keys()
movies_df[movies_df['movieId'].isin(id_movies_recom.head(10).keys().to_list())]

Unnamed: 0,movieId,title,genres
1484,1912,Out of Sight (1998),Comedy|Crime|Drama|Romance|Thriller
2399,3003,Train of Life (Train de vie) (1998),Comedy|Drama|Romance|War
3068,3893,Nurse Betty (2000),Comedy|Crime|Drama|Romance|Thriller
3696,4719,Osmosis Jones (2001),Action|Animation|Comedy|Crime|Drama|Romance|Th...
5052,6954,"Barbarian Invasions, The (Les invasions barbar...",Comedy|Crime|Drama|Mystery|Romance
5478,7831,Another Thin Man (1939),Comedy|Crime|Drama|Mystery|Romance
5482,7835,Song of the Thin Man (1947),Comedy|Crime|Drama|Musical|Mystery|Romance
8469,75408,Lupin III: Sweet Lost Night (Rupan Sansei: Swe...,Action|Animation|Comedy|Crime|Drama|Mystery|Ro...
8492,76153,Lupin III: First Contact (Rupan Sansei: Faasut...,Action|Animation|Comedy|Crime|Drama|Mystery|Ro...
10090,116799,Inherent Vice (2014),Comedy|Crime|Drama|Mystery|Romance


## Collaborative Filtering <a id = "5"></a>


In [15]:
def input_movie_rating_without_genres(movies, ratings):
    movie_rating_df = movie_rating(movies, ratings)
    
    idmovie_rating_df = movies_with_year_genres[movies_with_year_genres['title'].isin(movies)]
    user_movie_genres = movie_rating_df.merge(idmovie_rating_df, on = 'title')
    user_movie_genres.drop(columns = ['genres', 'year'], inplace = True)
    user_movie_genres.drop(columns = genres_list.to_list(), inplace = True)
            
    return user_movie_genres

In [16]:
my_new_rating = input_movie_rating_without_genres(['Rio Bravo', 'Vertigo', 'Modern Times',
                                                   'To Be or Not to Be', 'Some Like It Hot'], 
                                                  [5.0, 4.5, 4.0, 4.5, 4.0])
my_new_rating

Unnamed: 0,title,rating,movieId
0,Rio Bravo,5.0,4329
1,Vertigo,4.5,903
2,Modern Times,4.0,3462
3,To Be or Not to Be,4.5,946
4,To Be or Not to Be,4.5,26491
5,Some Like It Hot,4.0,910


In [17]:
rating_without_time = ratings_df.drop(columns = ['timestamp'])

users_with_shared_movies = rating_without_time[rating_without_time['movieId'].isin(my_new_rating['movieId'])]
user_groups_with_shared_movies = users_with_shared_movies.groupby('userId')
user_groups_with_shared_movies = sorted(user_groups_with_shared_movies,
                                        key = lambda x: len(x[1]), reverse = True)
user_groups_with_shared_movies[0] #101 users

(668,
         userId  movieId  rating
 99988      668      903     4.0
 99995      668      910     5.0
 100024     668      946     4.0
 101108     668     3462     4.0
 101425     668     4329     4.0
 102913     668    26491     2.0)

<h3 align="left"> Pearson Correlation <h3/>


In [18]:
pearsonCorrelationDict = {}

for name, group in user_groups_with_shared_movies:
    
    group = group.sort_values(by='movieId')
    my_new_rating = my_new_rating.sort_values(by='movieId')
    
    nRatings = len(group)
  
    temp_df = my_new_rating[my_new_rating['movieId'].isin(group['movieId'].tolist())]
    tempRatingList = temp_df['rating'].tolist()
    tempGroupList = group['rating'].tolist()

    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)
    

    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/np.sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0      

In [19]:
pearson_df = pd.DataFrame.from_dict(pearsonCorrelationDict, orient = 'index')
pearson_df.columns = ['similarity index']
pearson_df['userId'] = pearson_df.index
pearson_df.index = range(len(pearson_df))
top_users = pearson_df.sort_values(by = 'similarity index', ascending = False)[0:10]
top_users

Unnamed: 0,similarity index,userId
19,1.0,255
23,1.0,352
18,1.0,224
24,1.0,358
1,0.870388,213
5,0.866025,244
9,0.866025,530
6,0.5,275
11,0.5,615
65,0.0,313


In [20]:
top_users_rating = top_users.merge(ratings_df, on = 'userId')
top_users_rating['weighted rating'] = top_users_rating['similarity index'] * top_users_rating['rating']
top_users_rating.head()

Unnamed: 0,similarity index,userId,movieId,rating,timestamp,weighted rating
0,1.0,255,1,4.0,1174414865,4.0
1,1.0,255,17,4.5,1174416318,4.5
2,1.0,255,32,3.5,1174415405,3.5
3,1.0,255,34,4.0,1174416304,4.0
4,1.0,255,39,4.0,1174416299,4.0


In [21]:
recom_movie_id = top_users_rating.groupby(by = 'movieId').mean()[[
    'weighted rating', 'similarity index']].sort_values(by = 'weighted rating', ascending = False)
recom_movie_id['movieId'] = recom_movie_id.index
recom_movie_id.index = range(len(recom_movie_id))

recom_movie_id.head(10)

Unnamed: 0,weighted rating,similarity index,movieId
0,5.0,1.0,134853
1,5.0,1.0,96606
2,5.0,1.0,4865
3,5.0,1.0,4783
4,5.0,1.0,71899
5,5.0,1.0,55765
6,5.0,1.0,55820
7,5.0,1.0,81845
8,5.0,1.0,4644
9,5.0,1.0,96728


In [22]:
movies_df[movies_df['movieId'].isin(recom_movie_id['movieId'].head(10))]

Unnamed: 0,movieId,title,genres
3638,4644,Bread and Tulips (Pane e tulipani) (2000),Comedy|Drama|Romance
3740,4783,Endurance: Shackleton's Legendary Antarctic Ex...,Documentary
3795,4865,From Hell (2001),Crime|Horror|Mystery|Thriller
7537,55765,American Gangster (2007),Crime|Drama|Thriller
7541,55820,No Country for Old Men (2007),Crime|Drama
8283,71899,Mary and Max (2009),Animation|Comedy|Drama
8778,81845,"King's Speech, The (2010)",Drama
9436,96606,Samsara (2011),Documentary
9446,96728,"Master, The (2012)",Drama
10255,134853,Inside Out (2015),Animation|Children|Comedy
