In [2]:
# necessary libraries for the model
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import numpy as np
from fuzzywuzzy import process
from sklearn.metrics import mean_squared_error, mean_absolute_error



In [3]:
# import the movies dataset
movies_dataset = pd.read_csv("movies.csv")
movies_dataset.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# remove unnecessary columns
movies = movies_dataset.drop(columns=['genres'])
movies

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)
...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017)
9738,193583,No Game No Life: Zero (2017)
9739,193585,Flint (2017)
9740,193587,Bungo Stray Dogs: Dead Apple (2018)


In [5]:
movies.shape # number of rows and columns in movies dataset

(9742, 2)

In [6]:
# import the ratings dataset
ratings_dataset = pd.read_csv("ratings.csv")
ratings_dataset.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [7]:
# remove unnecessary columns
ratings = ratings_dataset.drop(columns=['timestamp'])
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [8]:
ratings.shape # number of rows and columns in ratings dataset

(100836, 3)

In [9]:
# transform the ratings dataset into matrix
user_ratings = ratings.pivot(index = 'movieId', columns = 'userId', values = 'rating').fillna(0)
user_ratings

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# create a csr matrix from the transformed dataset
matrix = csr_matrix(user_ratings.values)
matrix

<9724x610 sparse matrix of type '<class 'numpy.float64'>'
	with 100836 stored elements in Compressed Sparse Row format>

In [11]:
# split the data into train and test at 50%
X_train, X_test = train_test_split(matrix, test_size=0.5, random_state=0)

# Convert back to CSR if needed for certain operations
X_train_csr = csr_matrix(X_train)
X_test_csr = csr_matrix(X_test)

In [12]:
print(f"Training data shape: {X_train_csr.shape}") # number of rows and columns in the train data
print(f"Test data shape: {X_test_csr.shape}") # number of rows and columns in the test data

Training data shape: (4862, 610)
Test data shape: (4862, 610)


In [12]:
# create KNN model for the system
model = NearestNeighbors(metric = 'cosine', algorithm = 'brute', n_neighbors = 20)

In [13]:
# fit the model to the train data
model.fit(X_train_csr) 

In [14]:
# the recommender system function

def recommender(movie_name, data, n, relevant_threshold):
    
    result = process.extractOne(movie_name, movies['title'])   
    index = result[2]
    selected_movie_name = movies['title'][index]
    
    print("Movie selected:", selected_movie_name)
    print("Index:", index)
    print("Searching for recommendation...")
    
    if index >= data.shape[0]:
        print("Index out of range.")
        return
    
    distance, indices = model.kneighbors(data[index], n_neighbors = n)
    for i in indices:
            print(movies['title'][i])
               
    # Calculate relevance of recommendations
    relevant_mask = (ratings['movieId'].isin(indices.flatten())) & (ratings['rating'] >= relevant_threshold)
    
    # Calculate precision and recall
    relevant_count = relevant_mask.sum()
    precision = relevant_count / n
    total_relevant = len(ratings[ratings['movieId'] == index])
    recall = relevant_count / total_relevant

    print("Precision:", precision)
    print("Recall:", recall)
    
    # Calculate MSE and MAE
    mse = mean_squared_error([0]*n, distance[0])
    mae = np.mean(np.abs([0]*n - distance[0]))
    
    print("Mean Squared Error (MSE):", mse)
    print("Mean Absolute Error (MAE):", mae)

In [16]:
# some tests
recommender('matilda', X_test_csr, 2, 4.5)

Movie selected: Matilda (1996)
Index: 649
Searching for recommendation...
462               Scout, The (1994)
2450    White Men Can't Jump (1992)
Name: title, dtype: object
Precision: 0.5
Recall: 1.0
Mean Squared Error (MSE): 0.26823439047038394
Mean Absolute Error (MAE): 0.5176125545382146


In [21]:
recommender('matrix', X_test_csr, 5, 4.5)

Movie selected: Matrix, The (1999)
Index: 1939
Searching for recommendation...
649                    Matilda (1996)
4626              Brother Bear (2003)
1345               Chinese Box (1997)
4347                  Breakin' (1984)
287     Star Trek: Generations (1994)
Name: title, dtype: object
Precision: 1.0
Recall: 0.625
Mean Squared Error (MSE): 0.0
Mean Absolute Error (MAE): 0.0


In [23]:
recommender('godfather', X_test_csr, 5, 4.5)

Movie selected: Godfather, The (1972)
Index: 659
Searching for recommendation...
4858         Barbershop 2: Back in Business (2004)
848     Eighth Day, The (Huitième jour, Le) (1996)
3404                        No Holds Barred (1989)
4399                                Jubilee (1977)
4527             Once Upon a Time in Mexico (2003)
Name: title, dtype: object
Precision: 0.6
Recall: inf
Mean Squared Error (MSE): 0.11082838451904206
Mean Absolute Error (MAE): 0.33289103915962087


  recall = relevant_count / total_relevant


In [16]:
recommender('iron man', X_test_csr, 5, 4.5) # this movie is out of the test data (4862 movies) so it gives error

Movie selected: Iron Man (2008)
Index: 6743
Searching for recommendation...
Index out of range.


In [1]:
recommender('g', X_test_csr, 5, 4.5)

NameError: name 'recommender' is not defined