In [6]:
import pandas as pd
import numpy as np

In [7]:
# Load the movie table and the user ratings from CSV files in the working directory.
movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")
# Preview the first three rating rows.
ratings.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224


In [8]:
# The timestamp column is not used by the recommender, so drop it.
# Reassigning (instead of inplace=True) with errors="ignore" keeps this cell
# idempotent: running it a second time no longer raises KeyError.
ratings = ratings.drop(columns="timestamp", errors="ignore")

In [None]:
#ratings_matrix = ratings.pivot_table('rating', index='userId', columns='movieId')

In [None]:
#ratings_matrix.head(3)

In [9]:
# Attach the columns of `movies` to every rating row by joining on the shared movieId key.
ratings_movie = ratings.merge(movies, on='movieId')

In [10]:
# Preview the first three rows of the merged ratings/movies frame.
ratings_movie.head(3)

Unnamed: 0,userId,movieId,rating,title,genres
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [12]:
# Build the user x title rating matrix: one row per userId, one column per title.
# Cells without a rating are NaN; pivot_table aggregates duplicate (userId, title)
# pairs with its default aggregation (mean).
ratings_matrix = ratings_movie.pivot_table(
    values='rating',
    index="userId",
    columns='title',
)
ratings_matrix

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,,,,,,,,,,,...,,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,,,,,,,,,,,...,,,,,,4.5,3.5,,,
609,,,,,,,,,,,...,,,,,,,,,,


In [15]:
from sklearn.metrics import mean_squared_error

def get_rmse(R, P, Q, non_zeros):
    """Return the RMSE between observed ratings in ``R`` and the predictions ``P @ Q.T``.

    Only the cells listed in ``non_zeros`` are compared, so unobserved cells of
    ``R`` never contribute to the error.

    Parameters
    ----------
    R : numpy.ndarray
        2-D matrix of actual ratings, indexed as ``R[row, col]``.
    P : numpy.ndarray
        2-D factor matrix with one row per row of ``R``.
    Q : numpy.ndarray
        2-D factor matrix with one row per column of ``R``.
    non_zeros : sequence
        Items whose first two elements are the ``(row, col)`` position of an
        observed rating (any further elements are ignored here).

    Returns
    -------
    numpy.float64
        Root mean squared error over the listed cells.

    Raises
    ------
    ValueError
        If ``non_zeros`` is empty, since the error is undefined without any
        observed rating.
    """
    if len(non_zeros) == 0:
        raise ValueError("non_zeros is empty: RMSE is undefined without observed ratings")

    row_idx = np.array([entry[0] for entry in non_zeros])
    col_idx = np.array([entry[1] for entry in non_zeros])

    actual = R[row_idx, col_idx]

    # Predict only the observed cells (row-wise dot product of the matching
    # factor rows) instead of materialising the full P @ Q.T matrix and then
    # discarding almost all of it.
    predicted = np.einsum('ik,ik->i', P[row_idx], Q[col_idx])

    return np.sqrt(np.mean((actual - predicted) ** 2))

In [24]:
def matrix_factorization(R, K, steps=200, learning_rate=0.01, r_lambda=0.01):
    """Factorize the rating matrix ``R`` into ``P`` and ``Q`` with SGD so that ``P @ Q.T`` fits it.

    Only cells of ``R`` that are strictly greater than zero are used for
    training; zeros and NaN are skipped.

    Parameters
    ----------
    R : numpy.ndarray
        2-D rating matrix (rows and columns are called users and items below).
    K : int
        Number of latent factors, i.e. the number of columns of ``P`` and ``Q``.
    steps : int, default 200
        Number of full passes over the observed ratings.
    learning_rate : float, default 0.01
        SGD step size.
    r_lambda : float, default 0.01
        L2 regularization coefficient applied to both factor rows in each update.

    Returns
    -------
    (P, Q) : tuple of numpy.ndarray
        ``P`` has shape ``(R.shape[0], K)`` and ``Q`` has shape ``(R.shape[1], K)``.

    Notes
    -----
    The global NumPy random state is re-seeded with 1 on every call, so the
    initial factors (and hence the result) are reproducible. The training RMSE
    is printed every 10th pass.
    """
    num_users, num_items = R.shape

    # Random initial factors; fixed seed keeps runs reproducible.
    np.random.seed(1)
    P = np.random.normal(scale=1./K, size=(num_users, K))
    Q = np.random.normal(scale=1./K, size=(num_items, K))

    # Collect (row, col, rating) for every observed cell in one vectorized scan.
    # np.nonzero walks the matrix in row-major order, i.e. the same order a
    # nested "for i ... for j ..." loop would visit the cells.
    # NaN > 0 is False, so missing ratings are skipped; errstate silences the
    # comparison warning some NumPy versions emit for NaN.
    with np.errstate(invalid='ignore'):
        rows, cols = np.nonzero(R > 0)
    non_zeros = list(zip(rows.tolist(), cols.tolist(), R[rows, cols].tolist()))

    # SGD over the observed ratings.
    for step in range(steps):
        for i, j, r in non_zeros:
            # Prediction error for this single rating.
            eij = r - np.dot(P[i, :], Q[j, :].T)
            P[i, :] = P[i, :] + learning_rate * (eij * Q[j, :] - r_lambda * P[i, :])
            # Note: this update reads the P row that was just updated above.
            Q[j, :] = Q[j, :] + learning_rate * (eij * P[i, :] - r_lambda * Q[j, :])

        # The RMSE is only reported every 10th pass, so only compute it then.
        if (step % 10) == 0:
            rmse = get_rmse(R, P, Q, non_zeros)
            print('Iter step: ', step, " rmse: ", rmse)

    return P, Q

In [25]:
# Factorize the user x title rating matrix with SGD.
# K = number of latent factors, steps = number of SGD passes over the observed ratings,
# r_lambda = L2 regularization coefficient.
P, Q = matrix_factorization(ratings_matrix.values, K=50, steps=200, learning_rate=0.01, r_lambda=0.01)

Iter step:  0  rmse:  2.9023619751336867
Iter step:  10  rmse:  0.7335768591017927
Iter step:  20  rmse:  0.5115539026853442
Iter step:  30  rmse:  0.37261628282537446
Iter step:  40  rmse:  0.29608182991810134
Iter step:  50  rmse:  0.2520353192341642
Iter step:  60  rmse:  0.22487503275269854
Iter step:  70  rmse:  0.20685455302331535
Iter step:  80  rmse:  0.19413418783028688
Iter step:  90  rmse:  0.18470082002720406
Iter step:  100  rmse:  0.17742927527209104
Iter step:  110  rmse:  0.17165226964707492
Iter step:  120  rmse:  0.16695181946871726
Iter step:  130  rmse:  0.16305292191997542
Iter step:  140  rmse:  0.15976691929679646
Iter step:  150  rmse:  0.1569598699945732
Iter step:  160  rmse:  0.15453398186715428
Iter step:  170  rmse:  0.15241618551077643
Iter step:  180  rmse:  0.15055080739628307
Iter step:  190  rmse:  0.1488947091323209


In [26]:
# Predicted rating for every (user, title) pair: product of the learned factor matrices.
pred_matrix = P @ Q.T

In [29]:
# Wrap the predictions in a DataFrame that reuses the userId index and title
# columns of ratings_matrix, so predictions can be looked up by label.
pred_matrix_df = pd.DataFrame(
    pred_matrix,
    index=ratings_matrix.index,
    columns=ratings_matrix.columns,
)
pred_matrix_df.head(3)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.055084,4.092018,3.56413,4.502167,3.981215,1.271694,3.603274,2.333266,5.091749,3.972454,...,1.402608,4.208382,3.705957,2.720514,2.787331,3.475076,3.253458,2.161087,4.010495,0.859474
2,3.170119,3.657992,3.308707,4.166521,4.31189,1.275469,4.237972,1.900366,3.392859,3.647421,...,0.973811,3.528264,3.361532,2.672535,2.404456,4.232789,2.911602,1.634576,4.135735,0.725684
3,2.307073,1.658853,1.443538,2.208859,2.229486,0.78076,1.997043,0.924908,2.9707,2.551446,...,0.520354,1.709494,2.281596,1.782833,1.635173,1.323276,2.88758,1.042618,2.29389,0.396941


In [47]:
def get_unseen_movies(ratings_matrix, userId):
    """Return the column labels of ``ratings_matrix`` that ``userId`` has not rated.

    A column counts as seen when the user's value in it is strictly greater
    than zero; NaN and zero are treated as unseen.

    Parameters
    ----------
    ratings_matrix : pandas.DataFrame
        Rating matrix with one row per user and one column per movie.
    userId : label
        Row label of the user in ``ratings_matrix``.

    Returns
    -------
    list
        Unseen column labels, in the column order of ``ratings_matrix``.
    """
    user_rating = ratings_matrix.loc[userId, :]

    # A set gives O(1) membership checks; testing against a list made the
    # filter below quadratic in the number of columns.
    already_seen = set(user_rating[user_rating > 0].index.tolist())

    return [movie for movie in ratings_matrix.columns.tolist() if movie not in already_seen]

In [58]:
def recom_movie_by_userid(pred_df, userId, unseen_list, top_n=10):
    """Return the ``top_n`` highest predicted ratings for ``userId`` among ``unseen_list``.

    Looks up the user's row in ``pred_df``, restricts it to the columns named in
    ``unseen_list``, orders the values from highest to lowest and keeps the
    first ``top_n`` entries. The result is a Series indexed by column label.
    """
    candidate_scores = pred_df.loc[userId, unseen_list]
    ranked_scores = candidate_scores.sort_values(ascending=False)
    return ranked_scores.iloc[:top_n]

In [59]:
# Titles that user 9 has no positive rating for in ratings_matrix.
unseen_list = get_unseen_movies(ratings_matrix, userId = 9)

In [60]:
# Ten unseen titles with the highest predicted rating for user 9.
recom_movies = recom_movie_by_userid(pred_matrix_df, 9, unseen_list, top_n=10)

In [61]:
# Display the recommendations (title -> predicted rating).
recom_movies

title
Rear Window (1954)                                       5.704612
South Park: Bigger, Longer and Uncut (1999)              5.451100
Rounders (1998)                                          5.298393
Blade Runner (1982)                                      5.244951
Roger & Me (1989)                                        5.191962
Gattaca (1997)                                           5.183179
Ben-Hur (1959)                                           5.130463
Rosencrantz and Guildenstern Are Dead (1990)             5.087375
Big Lebowski, The (1998)                                 5.038690
Star Wars: Episode V - The Empire Strikes Back (1980)    4.989601
Name: 9, dtype: float64