## Movie Recommedation System
Referenced by: https://www.kaggle.com/code/vikashrajluhaniwal/movie-recommendation-using-surprise-library/notebook

In [1]:
!pip install numpy pandas matplotlib seaborn scikit-surprise

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357273 sha256=57707669b113ca7701a121f6cd834f8863189a4f13290cf8d18598235f13a31d
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.4


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from surprise import Dataset
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.metrics import confusion_matrix, precision_score, recall_score,classification_report

In [3]:
df = pd.read_csv("ratings_small.csv")
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [4]:
df.isna().sum()

Unnamed: 0,0
userId,0
movieId,0
rating,0
timestamp,0


In [5]:
dup_bool = df.duplicated(['userId', 'movieId', 'rating'])
print(f"Num of duplicate records: {sum(dup_bool)}")

Num of duplicate records: 0


In [6]:
print("Total # of ratings :",df.shape[0])
print("# of unique users:", df["userId"].nunique())
print("# of unique movies:", df["movieId"].nunique())

Total # of ratings : 100004
# of unique users: 671
# of unique movies: 9066


In [7]:
ratings_per_user = df.groupby(by='userId')['rating'].count()
ratings_per_user.describe()
ratings_per_movie = df.groupby(by='movieId')['rating'].count()
ratings_per_movie.describe()

Unnamed: 0,rating
count,9066.0
mean,11.030664
std,24.0508
min,1.0
25%,1.0
50%,3.0
75%,9.0
max,341.0


In [8]:
reader = Reader()
ratings = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)

train_ratings, test_ratings = train_test_split(ratings, test_size=0.2, random_state=42)

print(f"Size of trainset: {train_ratings.n_ratings}")
print(f"Size of testset: {len(test_ratings)}")

Size of trainset: 80003
Size of testset: 20001


## ML Models for recommendation from Surprise library

In [9]:
## Baseline method
from surprise import BaselineOnly

In [10]:
baseline_model = BaselineOnly()
baseline_model.fit(train_ratings)

Estimating biases using als...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x7a0d74688ee0>

In [11]:
train_predictions = baseline_model.test(train_ratings.build_testset())
test_predictions = baseline_model.test(test_ratings)

print("Train RMSE:", accuracy.rmse(train_predictions))
print("Test RMSE:", accuracy.rmse(test_predictions))

RMSE: 0.8557
Train RMSE: 0.8556854386787012
RMSE: 0.8989
Test RMSE: 0.8989464420419919


In [12]:
movies = pd.read_csv("movies_metadata.csv")
movies.head()

  movies = pd.read_csv("movies_metadata.csv")


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [13]:
movies['genres'][0]

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

In [14]:
def get_top_n_recommendations(userId,predictions, n=5):
    predict_ratings = {}

    # loop for getting predictions for the user
    for uid, iid, true_r, est, _ in predictions:
        if (uid==userId):
            predict_ratings[iid] = est
    predict_ratings = sorted(predict_ratings.items(), key=lambda kv: kv[1],reverse=True)[:n]
    top_movies = [i[0] for i in predict_ratings]
    top_movies = [str(i) for i in top_movies]
    print("="*10,"Recommended movies for user {} :".format(userId),"="*10)
    print(movies[movies["id"].isin(top_movies)]["original_title"].to_string(index=False))

get_top_n_recommendations(450,test_predictions)

              Sleepless in Seattle
           The Thomas Crown Affair
                   Men in Black II
Terminator 3: Rise of the Machines


In [15]:
from surprise import KNNBasic

# KNN Basic
knn_model = KNNBasic(random_state = 42,verbose = False)
knn_model.fit(train_ratings)

<surprise.prediction_algorithms.knns.KNNBasic at 0x7a0d7468bdc0>

In [16]:
train_predictions = knn_model.test(train_ratings.build_testset())
test_predictions = knn_model.test(test_ratings)

print("RMSE on training data : ", accuracy.rmse(train_predictions, verbose = False))
print("RMSE on test data: ", accuracy.rmse(test_predictions, verbose = False))

RMSE on training data :  0.7160931907176622
RMSE on test data:  0.9662515187787728


In [20]:
param_grid = {'k': list(range(10,45,5)),
             'min_k' : list(range(5,11))}

gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse'], return_train_measures = True, cv = 5)
gs.fit(ratings)
gs.best_params['rmse']


Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

{'k': 15, 'min_k': 5}

In [17]:
tuned_knn_model = KNNBasic(k = 15, min_k= 5,random_state = 42, verbose = False)
tuned_knn_model.fit(train_ratings)
train_predictions = tuned_knn_model.test(train_ratings.build_testset())
test_predictions = tuned_knn_model.test(test_ratings)

print("RMSE on training data : ", accuracy.rmse(train_predictions, verbose = False))
print("RMSE on test data: ", accuracy.rmse(test_predictions, verbose = False))

RMSE on training data :  0.7362583595173666
RMSE on test data:  0.9620821025717659


In [18]:
knn_model_item_based = KNNBasic(user_based = False, random_state = 42)
knn_model_item_based.fit(train_ratings)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7a0d74689bd0>

In [19]:
train_predictions = knn_model_item_based.test(train_ratings.build_testset())
test_predictions = knn_model_item_based.test(test_ratings)
print("RMSE on training data : ", accuracy.rmse(train_predictions, verbose = False))
print("RMSE on test data: ", accuracy.rmse(test_predictions, verbose = False))

RMSE on training data :  0.7160931907176622
RMSE on test data:  0.9662515187787728


In [20]:
from surprise import SVD

svd_model = SVD(random_state = 42)
svd_model.fit(train_ratings)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7a0d74689de0>

In [21]:
train_predictions = svd_model.test(train_ratings.build_testset())
test_predictions = svd_model.test(test_ratings)

print("RMSE on training data : ", accuracy.rmse(train_predictions, verbose = False))
print("RMSE on test data: ", accuracy.rmse(test_predictions, verbose = False))

RMSE on training data :  0.6431314535515623
RMSE on test data:  0.9023287246946667


In [26]:
param_distributions  = {'n_factors': list(range(50,160,10)),'reg_all': np.arange(0.02,0.2,0.02),'n_epochs' : list(range(1,51))}
rs = RandomizedSearchCV(SVD, param_distributions, measures=['rmse'], return_train_measures = True, cv = 5, n_iter = 20)
rs.fit(ratings)
rs.best_params['rmse']

{'n_factors': 80, 'reg_all': 0.06, 'n_epochs': 34}

In [22]:
tuned_svd_model = SVD(n_factors=130, reg_all =0.1, n_epochs = 50, random_state = 42,verbose = False)
tuned_svd_model.fit(train_ratings)
train_predictions = tuned_svd_model.test(train_ratings.build_testset())
test_predictions = tuned_svd_model.test(test_ratings)

print("RMSE on training data : ", accuracy.rmse(train_predictions, verbose = False))
print("RMSE on test data: ", accuracy.rmse(test_predictions, verbose = False))

RMSE on training data :  0.6690748062126369
RMSE on test data:  0.8850181530285666
