## Importing all libraries required

In [1]:
import pandas as pd
import numpy as np
from scipy.sparse.linalg import svds
from surprise import Reader, Dataset, accuracy, SVD
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split

base_path = 'data/'

In [2]:
movie_cols = ['id', 'title']
movies = pd.read_table(base_path+'movies.dat', encoding="ISO-8859-1", usecols=movie_cols)
#movies = movies[['id', 'title']]
print(movies)

          id                        title
0          1                    Toy story
1          2                      Jumanji
2          3               Grumpy Old Men
3          4            Waiting to Exhale
4          5  Father of the Bride Part II
...      ...                          ...
10192  65088              Bedtime Stories
10193  65091          Manhattan Melodrama
10194  65126                        Choke
10195  65130           Revolutionary Road
10196  65133      Blackadder Back & Forth

[10197 rows x 2 columns]


In [3]:
rating_cols = ['userID',  'movieID',  'rating']
ratings = pd.read_table(base_path+'user_ratedmovies.dat', encoding="ISO-8859-1", usecols=rating_cols)
print(ratings)

        userID  movieID  rating
0           75        3     1.0
1           75       32     4.5
2           75      110     4.0
3           75      160     2.0
4           75      163     4.0
...        ...      ...     ...
855593   71534    44555     4.0
855594   71534    46578     4.0
855595   71534    48516     4.5
855596   71534    61075     5.0
855597   71534    62049     4.5

[855598 rows x 3 columns]


In [4]:
user_cols = ['userID',  'movieID',  'tagID']
users = pd.read_table(base_path+'user_taggedmovies.dat', encoding="ISO-8859-1", usecols=user_cols)
print(users)

       userID  movieID  tagID
0          75      353   5290
1          78     4223   5264
2         127     1343   1544
3         127     1343  12330
4         127     2080   1451
...       ...      ...    ...
47952   71534     7937    306
47953   71534     8848    331
47954   71534     8848    427
47955   71534    25833   7671
47956   71534    30701    243

[47957 rows x 3 columns]


###### Ratings matrix: One row per user

In [5]:
ratings_pivot = ratings.pivot(index = 'userID', columns ='movieID', values = 'rating').fillna(0)
ratings_pivot.head()

movieID,1,2,3,4,5,6,7,8,9,10,...,64997,64999,65006,65011,65037,65088,65091,65126,65130,65133
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
75,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
78,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
127,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
170,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
175,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
ratings_pivot[1]

userID
75       0.0
78       0.0
127      0.0
170      3.0
175      4.0
        ... 
71497    5.0
71509    4.0
71525    0.0
71529    4.5
71534    0.0
Name: 1, Length: 2113, dtype: float64

In [7]:
ratings_pivot[75].value_counts()

0.0    2102
1.0       4
1.5       2
2.0       2
2.5       1
0.5       1
4.0       1
Name: 75, dtype: int64

In [8]:
users_count = ratings.userID.nunique()
movies_count = ratings.movieID.nunique()

sparsity = round(1.0 - len(ratings) / float(users_count * movies_count), 3)
print('Sparsity level of dataset is ' +  str(sparsity * 100) + '%')

Sparsity level of dataset is 96.0%


###### Normalizing rating matrix and converting it into numpy

In [9]:
rating_matrix = ratings_pivot.values
user_ratings_mean = np.mean(rating_matrix, axis = 1)
normalized_ratings = rating_matrix - user_ratings_mean.reshape(-1, 1)
print(normalized_ratings)

[[-0.01884459 -0.01884459  0.98115541 ... -0.01884459 -0.01884459
  -0.01884459]
 [-0.19082006 -0.19082006 -0.19082006 ... -0.19082006 -0.19082006
  -0.19082006]
 [-0.01093085 -0.01093085 -0.01093085 ... -0.01093085 -0.01093085
  -0.01093085]
 ...
 [-0.16282521 -0.16282521 -0.16282521 ... -0.16282521 -0.16282521
  -0.16282521]
 [ 4.4661193  -0.0338807   1.9661193  ... -0.0338807  -0.0338807
  -0.0338807 ]
 [-0.06988822 -0.06988822 -0.06988822 ... -0.06988822 -0.06988822
  -0.06988822]]


###### Applying singular value decomposition on our normalized rating matrix and then coverting it to diagonal matrix

In [10]:
U, sigma, Vt = svds(normalized_ratings, k = 50)
sigma = np.diag(sigma)


###### Lets do the movie ratings predictions now

In [11]:
pred_ratings_scores = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)
pred_ratings = pd.DataFrame(pred_ratings_scores, columns = ratings_pivot.columns)
pred_ratings.head()

movieID,1,2,3,4,5,6,7,8,9,10,...,64997,64999,65006,65011,65037,65088,65091,65126,65130,65133
0,0.085647,0.213415,0.110423,0.016728,-0.049234,0.395392,0.097255,0.073124,0.081229,0.359853,...,-0.002469,-0.003598,-0.002559,2.7e-05,-0.006142,-0.002608,0.0002,0.003159,-0.007042,0.001275
1,2.370667,0.294171,-0.675386,-0.081445,-0.566633,0.252062,-0.584425,-0.036364,-0.127027,-0.17148,...,-0.04268,0.00712,0.032698,0.009389,0.017942,0.029469,-0.045188,0.027863,-0.002353,0.039496
2,0.010428,0.047469,0.015976,0.00449,0.068292,-0.037154,0.001788,0.001419,-0.006662,-0.058778,...,-0.000904,0.001098,0.002294,-0.001312,0.001491,0.002138,0.00178,0.003912,0.000879,-0.001528
3,1.181997,0.766787,0.264725,0.046024,0.029374,0.315996,-0.09006,-0.02711,-0.070596,0.705608,...,-0.007663,-0.003119,-0.009331,-0.006964,0.00169,-0.008547,-0.001086,0.000242,-0.009939,0.017452
4,3.224798,0.821154,0.232838,0.004666,0.196803,1.768103,0.220659,0.078117,-0.086389,1.418806,...,-0.029763,-0.01079,-0.03033,-0.038571,-0.006996,-0.02799,-0.044882,-0.007199,-0.003874,-0.030449


###### top_recommended_movies is a function which return movies with the highest predicted rating that the specified user has now rated yet

In [12]:
def top_recommended_movies(predictions, userID, movies, original_ratings, num_recommendations):
    user_row_number = userID - 1 
    sorted_user_predictions = pred_ratings.iloc[user_row_number].sort_values(ascending=False)
    
    user_data = original_ratings[original_ratings.userID == (userID)]
    user_full = (user_data.merge(movies, how = 'left', left_on = 'movieID', right_on = 'id').sort_values(['rating'], ascending=False))

    print('User {0} has already rated {1} movies.'.format(userID, user_full.shape[0]))
    print('Recommending highest {0} predicted ratings movies those are not already rated.'.format(num_recommendations))
    
    recommendations = (movies[~movies['id'].isin(user_full['movieID'])].
         merge(pd.DataFrame(sorted_user_predictions).reset_index(), how = 'left',
               left_on = 'id',
               right_on = 'movieID').
         rename(columns = {user_row_number: 'Predictions'}).
         sort_values('Predictions', ascending = False).
                       iloc[:num_recommendations, :-1]
                      )

    return user_full, recommendations

In [13]:
rated, predicted = top_recommended_movies(pred_ratings, 75, movies, ratings, 20)


User 75 has already rated 55 movies.
Recommending highest 20 predicted ratings movies those are not already rated.


In [14]:
rated.head(20)


Unnamed: 0,userID,movieID,rating,id,title
50,75,32587,5.0,32587,Sin City
7,75,296,5.0,296,Pulp Fiction
54,75,45722,4.5,45722,Pirates of the Caribbean: Dead Man's Chest
29,75,2571,4.5,2571,The Matrix
17,75,1215,4.5,1215,Army of Darkness
23,75,1527,4.5,1527,The Fifth Element
1,75,32,4.5,32,Twelve Monkeys
14,75,996,4.5,996,Last Man Standing
12,75,832,4.5,832,Ransom
32,75,2700,4.5,2700,South Park: Bigger Longer & Uncut


In [15]:
predicted

Unnamed: 0,id,title,movieID
3968,4306,Shrek,4306.0
6444,6874,Kill Bill: Vol. 2,6874.0
6118,6539,Pirates of the Caribbean: The Curse of the Bla...,6539.0
6970,7438,Kill Bill: Vol. 2,7438.0
334,356,Forrest Gump,356.0
4537,4886,"Monsters, Inc.",4886.0
5964,6377,Finding Nemo,6377.0
1019,1136,Monty Python and the Holy Grail,1136.0
5080,5445,Minority Report,5445.0
1396,1580,Men in Black,1580.0


###### Lets evaluate RMSE (Root Mean Squared Error) on our model SVD with our data.

In [16]:
reader = Reader()
data = Dataset.load_from_df(ratings[['userID', 'movieID', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=.25)

svd = SVD()
cross_validate(svd, data, measures=['RMSE'], cv=5, verbose=True)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.7692  0.7708  0.7689  0.7728  0.7713  0.7706  0.0014  
Fit time          7.43    7.91    8.00    7.98    7.97    7.86    0.22    
Test time         1.91    1.31    1.61    1.84    1.32    1.60    0.25    


{'test_rmse': array([0.76915585, 0.77081366, 0.7689457 , 0.77278645, 0.771262  ]),
 'fit_time': (7.432273864746094,
  7.913349866867065,
  7.9979259967803955,
  7.980731010437012,
  7.968905925750732),
 'test_time': (1.9062771797180176,
  1.3143699169158936,
  1.611341953277588,
  1.8391449451446533,
  1.31842041015625)}

###### RMSE for test data

In [17]:
svd.fit(trainset)
predictions = svd.test(testset)
acc = accuracy.rmse(predictions)
print(acc)

RMSE: 0.7720
0.7719631170893145


###### Lets predict the rating for User no. 1310 will give to movie no. 1694

In [18]:
svd.predict(1310, 1694)


Prediction(uid=1310, iid=1694, r_ui=None, est=3.726876680329691, details={'was_impossible': False})