# MOVIE RECOMMENDATION SYSTEM

Importing important libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import sklearn.preprocessing
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import mean_absolute_error, mean_squared_error

Reading data

In [2]:
df = pd.read_csv('movies.csv')

In [3]:
df.head()

Unnamed: 0,movieId,title,genres,Ratings
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,1
2,3,Grumpier Old Men (1995),Comedy|Romance,2
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,3
4,5,Father of the Bride Part II (1995),Comedy,1


Cleaning and pre processing data

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10329 entries, 0 to 10328
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  10329 non-null  int64 
 1   title    10329 non-null  object
 2   genres   10329 non-null  object
 3   Ratings  10329 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 322.9+ KB


In [5]:
df.describe()

Unnamed: 0,movieId,Ratings
count,10329.0,10329.0
mean,31924.282893,1.554652
std,37734.741149,1.065596
min,1.0,0.0
25%,3240.0,1.0
50%,7088.0,2.0
75%,59900.0,2.0
max,149532.0,3.0


In [6]:
df.duplicated()   

0        False
1        False
2        False
3        False
4        False
         ...  
10324    False
10325    False
10326    False
10327    False
10328    False
Length: 10329, dtype: bool

In [7]:
df.isnull().sum()  

movieId    0
title      0
genres     0
Ratings    0
dtype: int64

Converting Data into Matrix Format

In [8]:
user_item_matrix = pd.pivot_table(df,index=df.index, columns='movieId', values='Ratings', fill_value=0)
user_item_matrix  

movieId,1,2,3,4,5,6,7,8,9,10,...,144482,144656,144976,146344,146656,146684,146878,148238,148626,149532
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10324,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
10325,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,0,0,0
10326,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10327,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,0


In [9]:
matrix = csr_matrix(user_item_matrix.values)
matrix

<10329x10329 sparse matrix of type '<class 'numpy.int64'>'
	with 8035 stored elements in Compressed Sparse Row format>

Training Model

In [10]:
model = NearestNeighbors(metric='cosine', algorithm='brute')
model.fit(matrix)                       

In [11]:
def get_movie_recommendations(movie_id, num_recommendations):           # movie recommendation function using knn algorithm
    
    distances, indices = model.kneighbors(matrix[movie_id], n_neighbors=num_recommendations+1)

    
    recommended_movie_indices = indices.squeeze()[1:]

    
    recommended_movie_titles = df.loc[recommended_movie_indices, 'title']

    return recommended_movie_titles

Testing Model

In [12]:
movie_id = 592                
number = 10
print("-----------------------------------------------------")
print("here is the list of",number,"movies for you based on movie_id",movie_id)
print("-----------------------------------------------------")
recommended_movies = get_movie_recommendations(movie_id,number)
print(recommended_movies)

-----------------------------------------------------
here is the list of 10 movies for you based on movie_id 592
-----------------------------------------------------
6889                                        Hostel (2005)
6890                                 Grandma's Boy (2006)
6883    We All Loved Each Other So Much (C'eravamo tan...
6884    Lady Vengeance (Sympathy for Lady Vengeance) (...
6885                                 49th Parallel (1941)
6886                                     Ted Bundy (2002)
6887                     District 13 (Banlieue 13) (2004)
6888                                    BloodRayne (2005)
6881       Voices of a Distant Star (Hoshi no koe) (2003)
6894                                   Hoodwinked! (2005)
Name: title, dtype: object


In [13]:
movie_id1 = 48              
no_of_recommend = 5
print("-----------------------------------------------------")
print("here is the list of",no_of_recommend,"movies for you based on movie_id",movie_id1)
print("-----------------------------------------------------")
recommended_movies = get_movie_recommendations(movie_id1,no_of_recommend)
print(recommended_movies)

-----------------------------------------------------
here is the list of 5 movies for you based on movie_id 48
-----------------------------------------------------
6886                                     Ted Bundy (2002)
6885                                 49th Parallel (1941)
6887                     District 13 (Banlieue 13) (2004)
6889                                        Hostel (2005)
6884    Lady Vengeance (Sympathy for Lady Vengeance) (...
Name: title, dtype: object


Checking Performance

In [14]:
# Assuming that the following are the actual ratings for the recommended movies by a user
actual_ratings = [0, 3, 2, 0, 1]

predicted_ratings = [df.loc[df['title'] == title, 'Ratings'].mean() for title in recommended_movies]

mae = mean_absolute_error(actual_ratings, predicted_ratings)         # Calculating the mean absolute error and root mean squared error
rmse = mean_squared_error(actual_ratings, predicted_ratings, squared=False)

print(f'Mean Absolute Error: {mae}')
print(f'Root Mean Squared Error: {rmse}')

Mean Absolute Error: 0.2
Root Mean Squared Error: 0.4472135954999579


In [15]:
accuracy = 1 - (mae / max(actual_ratings))      #checking the accuracy of the model

print(f'Accuracy: {accuracy * 100}%')

Accuracy: 93.33333333333333%
