<a href="https://www.kaggle.com/code/dilekdd/model-based-recommendation?scriptVersionId=198971230" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

<div style="text-align: center; font-size: 40px; font-weight: bold; color: Deepskyblue;">
     Model_Based_Recommendation
</div>



In [1]:
#importing the libraries and setting up the display options
!pip install scikit-surprise
import pandas as pd
from surprise import Reader, SVD, Dataset, accuracy
from surprise.model_selection import GridSearchCV, train_test_split, cross_validate
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 500)



In [2]:
#Importing the datasets and combining them
movie = pd.read_csv('/kaggle/input/movielens-20m-dataset/movie.csv')
rating = pd.read_csv('/kaggle/input/movielens-20m-dataset/rating.csv')
df = movie.merge(rating, how="inner", on="movieId")

In [3]:
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3,4.0,1999-12-11 13:36:47
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6,5.0,1997-03-13 17:50:52
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,4.0,1996-06-05 13:37:51
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10,4.0,1999-11-25 02:44:47
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,4.5,2009-01-02 01:13:41


In [4]:
df.shape

(20000263, 6)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000263 entries, 0 to 20000262
Data columns (total 6 columns):
 #   Column     Dtype  
---  ------     -----  
 0   movieId    int64  
 1   title      object 
 2   genres     object 
 3   userId     int64  
 4   rating     float64
 5   timestamp  object 
dtypes: float64(1), int64(2), object(3)
memory usage: 915.5+ MB


In [6]:
df.isnull().sum().sum()

0

In [7]:
#specifying some certain movies
movie_ids = [130219, 356, 4422, 541]
movies = ["The Dark Knight (2011)",
          "Cries and Whispers (Viskningar och rop) (1972)",
          "Forrest Gump (1994)",
          "Blade Runner (1982)"]

In [8]:
#creating a sample dataframe based on the specified movies
sample_df = df[df.movieId.isin(movie_ids)]
sample_df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
2457839,356,Forrest Gump (1994),Comedy|Drama|Romance|War,4,4.0,1996-08-24 09:28:42
2457840,356,Forrest Gump (1994),Comedy|Drama|Romance|War,7,4.0,2002-01-16 19:02:55
2457841,356,Forrest Gump (1994),Comedy|Drama|Romance|War,8,5.0,1996-06-05 13:44:19
2457842,356,Forrest Gump (1994),Comedy|Drama|Romance|War,9,4.0,2001-07-01 20:26:38
2457843,356,Forrest Gump (1994),Comedy|Drama|Romance|War,10,3.0,1999-11-25 02:32:02


In [9]:
#creating the user_movie_df based on the sample_df above
user_movie_df = sample_df.pivot_table(index=["userId"],
                                      columns=["title"],
                                      values="rating")

user_movie_df.head()

title,Blade Runner (1982),Cries and Whispers (Viskningar och rop) (1972),Forrest Gump (1994),The Dark Knight (2011)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,4.0,,,
2,5.0,,,
3,5.0,,,
4,,,4.0,
7,,,4.0,


In [10]:
user_movie_df.shape

(76918, 4)

In [11]:
#specifying the rating scale for modelling
reader = Reader(rating_scale=(1, 5))

In [12]:
#preparing the data for a recommendation model by structuring it 
#in a way that the Surprise library can work with
data = Dataset.load_from_df(sample_df[['userId',
                                       'movieId',
                                       'rating']], reader)

In [13]:
#spliting the dataset in two parts, trainset(0.75) and testset(0.25)
trainset, testset = train_test_split(data, test_size=.25)

In [14]:
#establishing the model
svd_model = SVD()

In [15]:
#fitting the model into the trainset
svd_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x79f834f47010>

In [16]:
#checking out the predictions
predictions = svd_model.test(testset)

In [17]:
#error rate of the model
accuracy.rmse(predictions)

RMSE: 0.9284


0.9284244553305553

In [18]:
#selecting a sample userID from the dataset to check their rating
sample_df[sample_df["userId"] == 1]

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
3612352,541,Blade Runner (1982),Action|Sci-Fi|Thriller,1,4.0,2005-04-02 23:30:03


In [19]:
#checking the predicted value
svd_model.predict(uid=1.0, iid=541, verbose=True)

user: 1.0        item: 541        r_ui = None   est = 3.98   {'was_impossible': False}


Prediction(uid=1.0, iid=541, r_ui=None, est=3.977404583784807, details={'was_impossible': False})

In [20]:
#creating the hyperparameters' grid
param_grid = {'n_epochs': [5, 10, 15, 20],
              'lr_all': [0.0001, 0.002, 0.0005, 0.0009],
              'reg_all': [0.0001, 0.002, 0.05, 0.000002]}

print(param_grid)

{'n_epochs': [5, 10, 15, 20], 'lr_all': [0.0001, 0.002, 0.0005, 0.0009], 'reg_all': [0.0001, 0.002, 0.05, 2e-06]}


In [21]:
#performing a detailed search over specified hyperparameters to find the best model
gs = GridSearchCV(SVD,
                  param_grid,
                  measures=['rmse', 'mae'],
                  cv=3,
                  n_jobs=-1,
                  joblib_verbose=True)

In [22]:
#fitting the model
gs.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   19.5s
[Parallel(n_jobs=-1)]: Done 192 out of 192 | elapsed:  1.8min finished


In [23]:
#getting the best score after fitting the new hyperparameters
gs.best_score['rmse']

0.9298753907560218

In [24]:
#gettingt the best hyperparameters
gs.best_params['rmse']

{'n_epochs': 15, 'lr_all': 0.0001, 'reg_all': 0.05}

In [25]:
#applying the best hyperparameters to the svd_model
svd_model = SVD(**gs.best_params['rmse'])

In [26]:
#merging the train and test datasets into trainset
data = data.build_full_trainset()

In [27]:
#applying the model to whole dataset
svd_model.fit(data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x79f7dee76020>

In [28]:
#checking the best parameters
gs.best_params['rmse']

{'n_epochs': 15, 'lr_all': 0.0001, 'reg_all': 0.05}

In [29]:
#checking the new prediction value after optimizing the model
svd_model.predict(uid=1.0, iid=541, verbose=True)

user: 1.0        item: 541        r_ui = None   est = 4.18   {'was_impossible': False}


Prediction(uid=1.0, iid=541, r_ui=None, est=4.179577208416467, details={'was_impossible': False})