In [1]:
# Collaborative filtering (CF) - Model based - Matrix Factorization (MF)
PYTHONHASHSEED=0 

In [2]:
# reading the data ml-latest-small
import pandas as pd
import zipfile
import numpy as np 
zf = zipfile.ZipFile('/home/elena/Downloads/ml-latest-small.zip')
# reading ratings file:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(zf.open('ml-latest-small/ratings.csv'), names=r_cols)
# reading movie file
m_cols=['movie_id', 'title', 'genre']
movies = pd.read_csv(zf.open('ml-latest-small/movies.csv'), names=m_cols)

In [3]:
# merging ratings and movies
data=pd.merge(ratings,movies,on='movie_id')
data.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,title,genre
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [4]:
data.shape

(100836, 6)

In [5]:
# unique values
unique_movie, unique_user, unique_genre, unique_ratings=data.movie_id.unique().shape[0], data.user_id.unique().shape[0], data.genre.unique().shape[0], data.rating.unique().shape[0]
print('Uniqe users: %5d, Unique movies: %5d, Unique genre: %5d'% (unique_user, unique_movie, unique_genre))
# user_id and movie_id are both sparse categorical variables. They have many possible values; 9724 and 610

Uniqe users:   610, Unique movies:  9724, Unique genre:   951


In [6]:
data.groupby('rating').count()

Unnamed: 0_level_0,user_id,movie_id,unix_timestamp,title,genre
rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.5,1370,1370,1370,1370,1370
1.0,2811,2811,2811,2811,2811
1.5,1791,1791,1791,1791,1791
2.0,7551,7551,7551,7551,7551
2.5,5550,5550,5550,5550,5550
3.0,20047,20047,20047,20047,20047
3.5,13136,13136,13136,13136,13136
4.0,26818,26818,26818,26818,26818
4.5,8551,8551,8551,8551,8551
5.0,13211,13211,13211,13211,13211


In [7]:
# splitting into train and test data
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.25, random_state=42)
train.shape, test.shape

((75627, 6), (25209, 6))

In [8]:
train.user_id.unique().shape[0], train.movie_id.unique().shape[0], test.user_id.unique().shape[0], test.movie_id.unique().shape[0]

(610, 8767, 610, 5627)

In [9]:
movies_train=train.movie_id.unique()
movies_test=test.movie_id.unique()
all_movies=data.movie_id.unique()

In [10]:
def intersection(lst1, lst2): 
    lst3 = [value for value in lst1 if value in lst2] 
    return lst3
def set_difference(lst1, lst2): 
    lst3 = [value for value in lst1 if value not in lst2] 
    return lst3

In [11]:
movies_notin_train=set_difference(all_movies,movies_train)
movies_notin_test=set_difference(all_movies,movies_test)
len(movies_notin_train), len(movies_notin_test)

(957, 4097)

In [12]:
# First step: create the user-item matrix for training and testing 
# training data
train_matrix=pd.pivot_table(train, values='rating', index='user_id', columns='movie_id', fill_value=0)
train_dataframe=pd.DataFrame(train_matrix, index=train.user_id.unique(), columns=train.movie_id.unique())
# test data
test_matrix=pd.pivot_table(test, values='rating', index='user_id', columns='movie_id', fill_value=0)
test_dataframe=pd.DataFrame(test_matrix, index=test.user_id.unique(), columns=test.movie_id.unique())

In [13]:
# reshaping test and train matrices to be of the same size, necessary for the mean squared error
adding_to_test=np.zeros((data.user_id.unique().shape[0],len(movies_notin_test)))
adding_to_test=pd.DataFrame( adding_to_test, index=data.user_id.unique(), columns=movies_notin_test)
test_values=pd.concat([test_dataframe,adding_to_test], axis=1)
test_values.shape

(610, 9724)

In [14]:
adding_to_train=np.zeros((data.user_id.unique().shape[0],len(movies_notin_train)))
adding_to_train=pd.DataFrame( adding_to_train, index=data.user_id.unique(), columns=movies_notin_train)
train_values=pd.concat([train_matrix,adding_to_train], axis=1)
train_values.shape

(610, 9724)

In [15]:
import scipy.sparse as sp 
from scipy.sparse.linalg import svds
# get svds components from train matrix
u, s, vt=  svds(train_values, k=20)
s_diag_matrix=np.diag(s)
# Now you can make a prediction by taking dot product of u, s and v^T
X_pred=np.dot(np.dot(u,s_diag_matrix),vt)
type(X_pred)

numpy.ndarray

In [16]:
# X_pred is a np.array; with rows = users_id and columns = movies_id
# lets transform X_pred into a dataframe
n_users=data.user_id.unique()
n_movies=data.movie_id.unique()
X_predict=pd.DataFrame(X_pred, index=n_users, columns=n_movies)
X_predict.max().max(), X_predict.min().min()
# the ratings in the dataset is from 1 to 5, while the estimated ratings are from -3 to 8!

(8.129032767692083, -3.352213210772039)

In [17]:
# scaling X_predict to range between 1 and 5
min_neg_value=X_predict.min().min()
max_value=X_predict.max().max()
X_scale=((X_pred-min_neg_value-0.8)/max_value)*3 + 1
X_predict_scale=pd.DataFrame(X_scale, index=n_users, columns=n_movies)
X_predict_scale.max().max(), X_predict_scale.min().min()

(4.941888149688184, 0.7047619232710529)

In [18]:
# evaluation
from sklearn.metrics import mean_squared_error
from math import sqrt
sqrt(mean_squared_error(X_predict_scale[test_values.columns], test_values))

1.9594097000359225

In [33]:
# recommending
def recommendations(userid, n_movies):
    movies_to_recommend_from=set_difference(all_movies,movies_train) # all movies not in train
    s=pd.DataFrame()
    for i in movies_to_recommend_from:
        s[i]=X_predict_scale[X_predict_scale.index==userid][i]
    predictions=np.transpose(s)
    print('For user %d we make the following recommendations:' %userid)
    return predictions

In [34]:
user_ratings = train[train.user_id==1]
print("User #{} has rated {} movies (avg. rating = {:.1f}):".format(
    1, len(user_ratings), user_ratings['rating'].mean(),
))
cols = ['user_id', 'movie_id', 'rating', 'title']
user_ratings.sort_values(by='rating', ascending=False)[cols]

User #1 has rated 175 movies (avg. rating = 4.4):


Unnamed: 0,user_id,movie_id,rating,title
2998,1,457,5.0,"Fugitive, The (1993)"
8108,1,1220,5.0,"Blues Brothers, The (1980)"
11157,1,2090,5.0,"Rescuers, The (1977)"
14323,1,2959,5.0,Fight Club (1999)
6410,1,1092,5.0,Basic Instinct (1992)
...,...,...,...,...
8025,1,1219,2.0,Psycho (1960)
12997,1,2617,2.0,"Mummy, The (1999)"
11663,1,2253,2.0,Toys (1992)
11991,1,2338,2.0,I Still Know What You Did Last Summer (1998)


In [44]:
preds_user1=recommendations(1,10)
preds_user1.columns=['predicted_ratings']
preds_user1['movie_id']=preds_user1.index
preds_user1.sort_values(by='predicted_ratings', ascending=False).head(10)

For user 1 we make the following recommendations:


Unnamed: 0,predicted_ratings,movie_id
77841,2.944065,77841
96004,2.415449,96004
310,2.352819,310
115203,2.334662,115203
2898,2.313721,2898
137595,2.166146,137595
6239,2.15524,6239
8511,2.145279,8511
5256,2.128797,5256
685,2.12344,685
