In [25]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import pandas as pd
from surprise import Reader, SVD, Dataset, accuracy
from surprise.model_selection import GridSearchCV, train_test_split, cross_validate

# None lifts the column limit so wide DataFrames are displayed without truncation.
pd.set_option('display.max_columns', None)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, fname) for fname in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/movielense20m/rating.csv
/kaggle/input/movielense20m/movie.csv


In [26]:
# Load the movie metadata and the user ratings from the Kaggle input directory.
movies = pd.read_csv('../input/movielense20m/movie.csv', low_memory=False)
ratings = pd.read_csv('../input/movielense20m/rating.csv')

# Left join on movieId: every movie row is kept, and a movie without any
# rating gets NaN in the rating columns.
df = pd.merge(movies, ratings, how="left", on="movieId")

In [27]:
# Preview the first rows of the merged movie/rating table.
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.0,4.0,1999-12-11 13:36:47
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6.0,5.0,1997-03-13 17:50:52
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8.0,4.0,1996-06-05 13:37:51
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10.0,4.0,1999-11-25 02:44:47
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11.0,4.5,2009-01-02 01:13:41


In [28]:
# We choose 4 films for easy review. Ids and titles live in one mapping so they
# cannot drift out of order (the separate title list previously had
# "Forrest Gump" and "Cries and Whispers" swapped relative to the ids).
sample_movies = {
    130219: "The Dark Knight (2011)",
    356: "Forrest Gump (1994)",
    4422: "Cries and Whispers (Viskningar och rop) (1972)",
    541: "Blade Runner (1982)",
}
movie_ids = list(sample_movies)
# Named `movie_titles` rather than `movies` so the movies DataFrame loaded
# earlier is not shadowed by a plain list.
movie_titles = list(sample_movies.values())

# Keep only the ratings that belong to the four selected movies.
sample_df = df[df.movieId.isin(movie_ids)]
print(sample_df.shape)  # a bare `sample_df.shape` mid-cell is never displayed
sample_df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
2457839,356,Forrest Gump (1994),Comedy|Drama|Romance|War,4.0,4.0,1996-08-24 09:28:42
2457840,356,Forrest Gump (1994),Comedy|Drama|Romance|War,7.0,4.0,2002-01-16 19:02:55
2457841,356,Forrest Gump (1994),Comedy|Drama|Romance|War,8.0,5.0,1996-06-05 13:44:19
2457842,356,Forrest Gump (1994),Comedy|Drama|Romance|War,9.0,4.0,2001-07-01 20:26:38
2457843,356,Forrest Gump (1994),Comedy|Drama|Romance|War,10.0,3.0,1999-11-25 02:32:02


### 1. CREATING USER MOVIE DATAFRAME

In [29]:
# Pivot the sampled ratings into a user x title matrix: one row per userId,
# one column per movie title, NaN where the user has not rated the movie.
# pivot_table aggregates with the mean by default.
user_movie_df = pd.pivot_table(
    sample_df,
    values="rating",
    index=["userId"],
    columns=["title"],
)
user_movie_df.head()

title,Blade Runner (1982),Cries and Whispers (Viskningar och rop) (1972),Forrest Gump (1994),The Dark Knight (2011)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0,4.0,,,
2.0,5.0,,,
3.0,5.0,,,
4.0,,,4.0,
7.0,,,4.0,


In [30]:
# (rows, columns) of the pivot: one row per userId, one column per sampled title.
user_movie_df.shape

(76918, 4)

In [31]:
# Rating scale: MovieLens ratings run from 0.5 to 5.0 in half-star steps
# (the preview of the merged table already contains a 4.5), so the lower
# bound must be 0.5. With (1, 5) surprise would clip predictions at 1.
reader = Reader(rating_scale=(0.5, 5))
# surprise expects exactly these three columns, in the order user, item, rating.
data = Dataset.load_from_df(sample_df[['userId', 'movieId', 'rating']], reader)

### 2. MODELLING

In [32]:
# Hold out 25% of the ratings for evaluation. Fixed seeds make both the split
# and the SVD factor initialisation repeatable across "Restart & Run All".
trainset, testset = train_test_split(data, test_size=.25, random_state=42)
svd_model = SVD(random_state=42)
svd_model.fit(trainset)
predictions = svd_model.test(testset)

# Peek at the first few predictions (user id, item id, true rating, estimate).
predictions[0:5]

[Prediction(uid=25077.0, iid=541, r_ui=5.0, est=4.474725387991851, details={'was_impossible': False}),
 Prediction(uid=52011.0, iid=356, r_ui=3.5, est=3.9139507313339172, details={'was_impossible': False}),
 Prediction(uid=54745.0, iid=356, r_ui=5.0, est=4.131768141888016, details={'was_impossible': False}),
 Prediction(uid=111459.0, iid=356, r_ui=3.0, est=4.131768141888016, details={'was_impossible': False}),
 Prediction(uid=43089.0, iid=356, r_ui=4.0, est=4.131768141888016, details={'was_impossible': False})]

In [33]:
# Root-mean-squared error of the SVD predictions on the held-out test split.
accuracy.rmse(predictions)

RMSE: 0.9346


0.9346102940426895

In [34]:
# 5-fold cross-validation of the same SVD estimator, reporting RMSE and MAE per fold.
# NOTE(review): cross_validate presumably re-fits `svd_model` on every fold, so the
# predict calls that follow may use the last fold's fit rather than the
# train/test-split fit from the modelling cell — confirm.
cross_validate(svd_model, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9478  0.9360  0.9291  0.9330  0.9416  0.9375  0.0066  
MAE (testset)     0.7299  0.7281  0.7200  0.7240  0.7292  0.7262  0.0038  
Fit time          6.40    6.12    6.31    6.32    6.31    6.29    0.09    
Test time         0.39    0.13    0.17    0.38    0.17    0.25    0.11    


{'test_rmse': array([0.94780158, 0.93601109, 0.92910937, 0.93303592, 0.94156657]),
 'test_mae': array([0.72990881, 0.72812331, 0.71998037, 0.72396541, 0.72923303]),
 'fit_time': (6.397135972976685,
  6.115283250808716,
  6.309149980545044,
  6.323358774185181,
  6.309827089309692),
 'test_time': (0.38672733306884766,
  0.13127589225769043,
  0.17323708534240723,
  0.38427209854125977,
  0.1674497127532959)}

In [35]:
# Show the user x title rating matrix again as a reference for the predictions that follow.
user_movie_df.head()

title,Blade Runner (1982),Cries and Whispers (Viskningar och rop) (1972),Forrest Gump (1994),The Dark Knight (2011)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0,4.0,,,
2.0,5.0,,,
3.0,5.0,,,
4.0,,,4.0,
7.0,,,4.0,


In [36]:
# Estimate the rating of user 1 for Blade Runner (movieId 541).
# The pivot table shows this user already rated it 4.0, so this doubles as a sanity check.
# uid is passed as a float because userId is stored as float in the merged table.
svd_model.predict(uid=1.0, iid=541, verbose=True)

user: 1.0        item: 541        r_ui = None   est = 4.14   {'was_impossible': False}


Prediction(uid=1.0, iid=541, r_ui=None, est=4.141477447026761, details={'was_impossible': False})

In [37]:
# Estimate the rating of user 1 for Forrest Gump (movieId 356), which the
# pivot table shows this user has not rated.
# NOTE(review): this comment used to say "Whispers", but 356 is Forrest Gump in the
# merged table; "Cries and Whispers" has a different movieId (presumably 4422 — confirm).
svd_model.predict(uid=1.0, iid=356, verbose=True)

user: 1.0        item: 356        r_ui = None   est = 4.24   {'was_impossible': False}


Prediction(uid=1.0, iid=356, r_ui=None, est=4.2448095849958225, details={'was_impossible': False})

### 3. MODEL TUNING

In [38]:
# Exhaustive grid search over the SVD epoch count and the global learning rate:
# 2 x 2 combinations, each scored with 3-fold cross-validation.
param_grid = {
    'n_epochs': [5, 10],
    'lr_all': [0.002, 0.005],
}
gs = GridSearchCV(
    SVD,
    param_grid,
    measures=['rmse', 'mae'],
    cv=3,
    n_jobs=-1,  # use every available core
    joblib_verbose=True,
)
gs.fit(data)

# Best mean RMSE across the folds.
gs.best_score['rmse']

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:   46.5s finished


0.9319599734222092

In [39]:
# Hyper-parameter combination that gave the best RMSE in the grid search.
gs.best_params['rmse']

{'n_epochs': 10, 'lr_all': 0.002}

### 4. FINAL MODEL AND PREDICTION

In [40]:
# Refit SVD with the best-RMSE hyper-parameters on ALL sampled ratings (no hold-out).
# The fixed seed keeps the final predictions repeatable between runs.
svd_model = SVD(random_state=42, **gs.best_params['rmse'])
# Keep the trainset under its own name: rebinding `data` would replace the
# Dataset with a Trainset, so re-running this cell (or any earlier cell that
# expects a Dataset) would fail.
full_trainset = data.build_full_trainset()
svd_model.fit(full_trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f0735fb5710>

In [41]:
# Estimate the rating of user 1 for Blade Runner (movieId 541) with the refitted model.
# The pivot table shows this user already rated it 4.0, so this doubles as a sanity check.
svd_model.predict(uid=1.0, iid=541, verbose=True)

user: 1.0        item: 541        r_ui = None   est = 4.21   {'was_impossible': False}


Prediction(uid=1.0, iid=541, r_ui=None, est=4.2117158126477285, details={'was_impossible': False})

In [42]:
# Estimate the rating of user 1 for Forrest Gump (movieId 356) with the refitted model.
# NOTE(review): this comment used to say "Whispers", but 356 is Forrest Gump in the
# merged table; "Cries and Whispers" has a different movieId (presumably 4422 — confirm).
svd_model.predict(uid=1.0, iid=356, verbose=True)

user: 1.0        item: 356        r_ui = None   est = 4.11   {'was_impossible': False}


Prediction(uid=1.0, iid=356, r_ui=None, est=4.110415785242327, details={'was_impossible': False})