In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.sparse.linalg import svds
import seaborn as sns

In [2]:
ratings=pd.read_csv('../Data/ratings.csv')
movies=pd.read_csv('../Data/movies.csv')
users=pd.read_csv('../Data/user.csv')

In [3]:
users=users.drop(users.columns[0],axis=1)
users.head()
movies=movies.drop(movies.columns[0],axis=1)

In [4]:
ratings

Unnamed: 0,index,user_id,movie_id,rating,timestamp
0,0,1,1193,5,978300760
1,1,1,661,3,978302109
2,2,1,914,3,978301968
3,3,1,3408,4,978300275
4,4,1,2355,5,978824291
...,...,...,...,...,...
1000204,1000204,6040,1091,1,956716541
1000205,1000205,6040,1094,5,956704887
1000206,1000206,6040,562,5,956704746
1000207,1000207,6040,1096,4,956715648


In [6]:
print("Number user:",len(np.unique(ratings['user_id'])))
print("Number movie:",len(np.unique(ratings['movie_id'])))

Number user: 6040
Number movie: 3706


Convert my ratings matrix to be one row per users and one column per movie. To do so, I'll pivot ratings to get that and call the new variable Ratings

In [7]:
Ratings=ratings.pivot(index='user_id',columns='movie_id',values='rating').fillna(0)
Ratings.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
Ratings.iloc[0,:]

movie_id
1       5.0
2       0.0
3       0.0
4       0.0
5       0.0
       ... 
3948    0.0
3949    0.0
3950    0.0
3951    0.0
3952    0.0
Name: 1, Length: 3706, dtype: float64

In [9]:
r=Ratings.values # convert to matrix
print(r.shape)
user_ratings_mean=np.mean(r,axis=1)
print(user_ratings_mean.shape)
rating_mean=r-user_ratings_mean.reshape(-1,1)

(6040, 3706)
(6040,)


In [10]:
U,sigma,Vt=svds(rating_mean,k=50)
print(U.shape)
print(sigma.shape)
print(Vt.shape)

(6040, 50)
(50,)
(50, 3706)


In [11]:
sigma=np.diag(sigma)
print(sigma.shape)

(50, 50)


# Making predictions from the Decomposed Matrices

At this time, we need to make movie ratings predictions for all users. I can do it at once by following the math and matrix multiply $U$,$\Sigma$,$V^T$ back to get the rank K=50 to approximation of A

In [12]:
all_user_predicted_ratings=np.dot(np.dot(U,sigma),Vt)+user_ratings_mean.reshape(-1,1)

In [59]:
predict=pd.DataFrame(all_user_predicted_ratings,columns=Ratings.columns)
predict.head(3)

movie_id,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
0,4.288861,0.143055,-0.19508,-0.018843,0.012232,-0.176604,-0.07412,0.141358,-0.059553,-0.19595,...,0.027807,0.00164,0.026395,-0.022024,-0.085415,0.403529,0.105579,0.031912,0.05045,0.08891
1,0.744716,0.169659,0.335418,0.000758,0.022475,1.35305,0.051426,0.071258,0.161601,1.567246,...,-0.056502,-0.013733,-0.01058,0.062576,-0.016248,0.15579,-0.418737,-0.101102,-0.054098,-0.140188
2,1.818824,0.456136,0.090978,-0.043037,-0.025694,-0.158617,-0.131778,0.098977,0.030551,0.73547,...,0.040481,-0.005301,0.012832,0.029349,0.020866,0.121532,0.076205,0.012345,0.015148,-0.109956


In [58]:
movie_ids=movies['movie_id'].astype('int')

In [74]:
def movie_recommend(userID,number_movie_recommendation):
    user_id=userID #start from 0
    user_data_for_id=ratings[ratings.user_id==user_id]
    already_movie_rated=user_data_for_id.merge(movies,how='inner',left_on='movie_id',right_on='movie_id').sort_values(['rating'])
    print("User ID {} has been voted {} number movies".format(userID,already_movie_rated.shape[0]))
    socres_predict=np.array(predict.iloc[user_id,:])
    c={movie_ids[i]:socres_predict[i] for i in range(len(socres_predict))}
    c=sorted(c.items(),key=lambda x:x[1],reverse=True)[1:number_movie_recommendation]
    indices=[i[0] for i in c]    
    return already_movie_rated,movies.iloc[indices]

In [75]:
already_rated,prediction=movie_recommend(1310,20)

User ID 1310 has been voted 24 number movies


In [76]:
already_rated.head(20)

Unnamed: 0,index,user_id,movie_id,rating,timestamp,title,genres
21,215949,1310,1231,2,974781963,"Right Stuff, The (1983)",Drama
2,215930,1310,1295,2,974782001,"Unbearable Lightness of Being, The (1988)",Drama
14,215942,1310,2313,2,974781839,"Elephant Man, The (1980)",Drama
22,215950,1310,1090,2,974781839,Platoon (1986),Drama|War
0,215928,1310,2988,3,974781935,Melvin and Howard (1980),Drama
19,215947,1310,1960,3,974782001,"Last Emperor, The (1987)",Drama|War
16,215944,1310,144,3,974781573,"Brothers McMullen, The (1995)",Comedy
20,215948,1310,2000,4,974781892,Lethal Weapon (1987),Action|Comedy|Crime|Drama
18,215946,1310,3526,4,974781892,Parenthood (1989),Comedy|Drama
17,215945,1310,3360,4,974781935,Hoosiers (1986),Drama


In [77]:
prediction

Unnamed: 0,movie_id,title,genres
1136,1152,He Walked by Night (1948),Crime|Film-Noir|Thriller
1038,1052,"Proprietor, The (1996)",Drama
1122,1138,Dadetown (1995),Documentary
812,823,"Collectionneuse, La (1967)",Drama
2495,2564,"Empty Mirror, The (1999)",Drama
869,880,"Island of Dr. Moreau, The (1996)",Sci-Fi|Thriller
1147,1163,Mina Tannenbaum (1994),Drama
470,474,In the Line of Fire (1993),Action|Thriller
1196,1214,Alien (1979),Action|Horror|Sci-Fi|Thriller
1016,1029,Dumbo (1941),Animation|Children's|Musical


In [78]:
# Import libraries from Surprise package
from surprise import Reader, Dataset, SVD, evaluate

# Load Reader library
reader = Reader()

# Load ratings dataset with Dataset library
data = Dataset.load_from_df(ratings[['user_id', 'movie_id', 'rating']], reader)

# Split the dataset for 5-fold evaluation
data.split(n_folds=5)

svd = SVD()

# Compute the RMSE of the SVD algorithm.
evaluate(svd, data, measures=['RMSE'])

ModuleNotFoundError: No module named 'surprise'