In [1]:
# reading the data ml-latest-small
import pandas as pd
import zipfile
import numpy as np 
zf = zipfile.ZipFile('/home/elena/Downloads/ml-latest-small.zip')
# reading ratings file:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(zf.open('ml-latest-small/ratings.csv'), names=r_cols)
m_cols=['movie_id', 'title', 'genre']
movies = pd.read_csv(zf.open('ml-latest-small/movies.csv'), names=m_cols)
# merging ratings and movies
data=pd.merge(ratings,movies,on='movie_id')

In [2]:
data.shape

(100836, 6)

In [3]:
# unique values
unique_movie, unique_user, unique_genre, unique_ratings=data.movie_id.unique().shape[0], data.user_id.unique().shape[0], data.genre.unique().shape[0], data.rating.unique().shape[0]
print('Uniqe users: %5d, Unique movies: %5d, Unique genre: %5d'% (unique_user, unique_movie, unique_genre))
# user_id and movie_id are both sparse categorical variables. They have many possible values; 9724 and 610

Uniqe users:   610, Unique movies:  9724, Unique genre:   951


In [4]:
data.groupby('rating').count()

Unnamed: 0_level_0,user_id,movie_id,unix_timestamp,title,genre
rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.5,1370,1370,1370,1370,1370
1.0,2811,2811,2811,2811,2811
1.5,1791,1791,1791,1791,1791
2.0,7551,7551,7551,7551,7551
2.5,5550,5550,5550,5550,5550
3.0,20047,20047,20047,20047,20047
3.5,13136,13136,13136,13136,13136
4.0,26818,26818,26818,26818,26818
4.5,8551,8551,8551,8551,8551
5.0,13211,13211,13211,13211,13211


In [7]:
# using train / test data saved
train=pd.read_pickle('/home/elena/Downloads/traindata.pkl')
test=pd.read_pickle('/home/elena/Downloads/testdata.pkl')
col_used=data.columns
train=train[col_used]
test=test[col_used]

In [8]:
train.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,title,genre
97717,606,3462,4.0,1171501099,Modern Times (1936),Comedy|Drama|Romance
100124,610,8914,4.0,1493845360,Primer (2004),Drama|Sci-Fi
25952,180,1196,4.0,1270237862,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi
25871,178,2231,4.5,1163673637,Rounders (1998),Drama
97255,605,1588,4.0,1277094877,George of the Jungle (1997),Children|Comedy


In [9]:
train.user_id.unique().shape[0], train.movie_id.unique().shape[0], test.user_id.unique().shape[0], test.movie_id.unique().shape[0]

(610, 8762, 610, 5672)

In [10]:
movies_train=train.movie_id.unique()
movies_test=test.movie_id.unique()
all_movies=data.movie_id.unique()

In [11]:
def intersection(lst1, lst2): 
    lst3 = [value for value in lst1 if value in lst2] 
    return lst3
def set_difference(lst1, lst2): 
    lst3 = [value for value in lst1 if value not in lst2] 
    return lst3

In [12]:
movies_notin_train=set_difference(all_movies,movies_train)
movies_notin_test=set_difference(all_movies,movies_test)
len(movies_notin_train), len(movies_notin_test)

(962, 4052)

In [13]:
# First step: create the user-item matrix for training and testing 
# training data
train_matrix=pd.pivot_table(train, values='rating', index='user_id', columns='movie_id', fill_value=0)
train_dataframe=pd.DataFrame(train_matrix, index=train.user_id.unique(), columns=train.movie_id.unique())
# test data
test_matrix=pd.pivot_table(test, values='rating', index='user_id', columns='movie_id', fill_value=0)
test_dataframe=pd.DataFrame(test_matrix, index=test.user_id.unique(), columns=test.movie_id.unique())

In [29]:
test_dataframe.shape, train_dataframe.shape

((610, 5672), (610, 8762))

In [15]:
# reshaping train matrix to include all movies
adding_to_train=np.zeros((data.user_id.unique().shape[0],len(movies_notin_train)))
adding_to_train=pd.DataFrame( adding_to_train, index=data.user_id.unique(), columns=movies_notin_train)
train_values=pd.concat([train_matrix,adding_to_train], axis=1)
train_values.shape

(610, 9724)

In [30]:
train_values.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,110286,110591,115727,117867,118082,130050,135534,138610,141799,163981
1,4.0,0.0,4.0,0,0.0,4.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [57]:
import scipy.sparse as sp 
from scipy.sparse.linalg import svds
# get svds components from train matrix
u, s, vt=  svds(train_values, k=6)
s_diag_matrix=np.diag(s)
# Now you can make a prediction by taking dot product of u, s and v^T
X_pred=np.dot(np.dot(u,s_diag_matrix),vt)
type(X_pred)

numpy.ndarray

In [58]:
# X_pred is a np.array; with rows = users_id and columns = movies_id
# lets transform X_pred into a dataframe
n_users=data.user_id.unique()
n_movies=data.movie_id.unique()
X_predict=pd.DataFrame(X_pred, index=n_users, columns=n_movies)
X_predict.max().max(), X_predict.min().min()
# the ratings in the dataset is from 1 to 5, while the estimated ratings are from -3 to 8!

(8.047211341208374, -3.028408696878355)

In [59]:
# scaling X_predict to range between 1 and 5
min_neg_value=X_predict.min().min()
max_value=X_predict.max().max()
X_scale=((X_pred-min_neg_value-0.8)/max_value)*3 + 1
X_predict_scale=pd.DataFrame(X_scale, index=n_users, columns=n_movies)
X_predict_scale.max().max(), X_predict_scale.min().min()

(4.830750654751812, 0.701760038572615)

In [60]:
X_predict_scale=X_predict_scale.round(0)
X_predict_scale.max().max(), X_predict_scale.min().min()

(5.0, 1.0)

In [61]:
X_predict_scale.shape

(610, 9724)

In [62]:
# evaluation
from sklearn.metrics import mean_absolute_error
mean_absolute_error(X_predict_scale[test_dataframe.columns], test_dataframe)

2.000829787971976

In [31]:
# recommending
def recommendations(userid, n_movies):
    movies_to_recommend_from=set_difference(all_movies,movies_train) # all movies not in train
    s=pd.DataFrame()
    for i in movies_to_recommend_from:
        s[i]=X_predict_scale[X_predict_scale.index==userid][i]
    predictions=np.transpose(s)
    print('For user %d we make the following recommendations:' %userid)
    return predictions

In [35]:
user_ratings = train[train.user_id==1]
print("User #{} has rated {} movies from train set(avg. rating = {:.1f}):".format(
    1, len(user_ratings), user_ratings['rating'].mean(),
))
cols = ['user_id', 'movie_id', 'rating', 'title']
user_ratings.sort_values(by='rating', ascending=False)[cols]

User #1 has rated 186 movies from train set(avg. rating = 4.3):


Unnamed: 0,user_id,movie_id,rating,title
90,1,1298,5.0,Pink Floyd: The Wall (1982)
82,1,1256,5.0,Duck Soup (1933)
67,1,1136,5.0,Monty Python and the Holy Grail (1975)
9,1,157,5.0,Canadian Bacon (1995)
111,1,1804,5.0,"Newton Boys, The (1998)"
...,...,...,...,...
152,1,2389,2.0,Psycho (1998)
148,1,2338,2.0,I Still Know What You Did Last Summer (1998)
76,1,1219,2.0,Psycho (1960)
170,1,2617,2.0,"Mummy, The (1999)"


In [50]:
preds_user1=recommendations(1,10)
preds_user1.columns=['predicted_ratings']
preds_user1['movie_id']=preds_user1.index
movie_name=lambda movie: data.title[data.movie_id==movie]
movie_genre=lambda movie: data.genre[data.movie_id==movie]
preds_user1['title']=preds_user1.movie_id.map(movie_name)
preds_user1['genre']=preds_user1.movie_id.map(movie_genre)
preds_user1.sort_values(by='predicted_ratings', ascending=False).head(10)

For user 1 we make the following recommendations:


Unnamed: 0,predicted_ratings,movie_id,title,genre
7899,3.0,7899,18826 Master of the Flying Guillotine (Du b...,"18826 Action Name: genre, dtype: object"
6143,3.0,6143,66274 Trail of the Pink Panther (1982) 6627...,66274 Comedy|Crime 66275 Comedy|Crime Na...
117368,3.0,117368,68694 The Madagascar Penguins in a Christma...,"68694 Animation|Comedy Name: genre, dtype: ..."
148675,3.0,148675,68895 North Pole: Open For Christmas (2015)...,"68895 Children|Fantasy Name: genre, dtype: ..."
160872,3.0,160872,"68899 Satanic (2016) Name: title, dtype: ob...","68899 Horror Name: genre, dtype: object"
7924,3.0,7924,70503 Stray Dog (Nora inu) (1949) Name: tit...,"70503 Drama|Film-Noir|Thriller Name: genre,..."
121342,3.0,121342,"86814 Carry on Cruising (1962) Name: title,...","86814 Comedy|Romance Name: genre, dtype: ob..."
121007,3.0,121007,"86810 Space Buddies (2009) Name: title, dty...",86810 Adventure|Children|Fantasy|Sci-Fi Nam...
60943,3.0,60943,"73803 Frozen River (2008) Name: title, dtyp...","73803 Drama Name: genre, dtype: object"
109596,3.0,109596,86786 Wrinkles (Arrugas) (2011) Name: title...,"86786 Animation|Drama Name: genre, dtype: o..."


In [46]:
name_movie=lambda name: data.movie_id[data.title==name].unique()
name_movie('Alien (1979)')

array([1214])

In [48]:
X_predict_scale[X_predict_scale.index==1][1214]

1    2.0
Name: 1214, dtype: float64