In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import sparse

%matplotlib inline

In [4]:
# load the cleaned movie and review tables; the 'Unnamed: 0' column
# (presumably an index saved along with the CSV) is dropped right away
movies = pd.read_csv('./data/movies_clean.csv').drop(columns=['Unnamed: 0'])
reviews = pd.read_csv('./data/reviews_clean.csv').drop(columns=['Unnamed: 0'])

In [6]:
# every review whose user_id is 73486
reviews.loc[reviews['user_id'] == 73486]

Unnamed: 0,user_id,movie_id,rating,timestamp,date,month_1,month_2,month_3,month_4,month_5,...,month_9,month_10,month_11,month_12,year_2013,year_2014,year_2015,year_2016,year_2017,year_2018


In [7]:
# every review whose movie_id is 73486
reviews.loc[reviews['movie_id'] == 73486]

Unnamed: 0,user_id,movie_id,rating,timestamp,date,month_1,month_2,month_3,month_4,month_5,...,month_9,month_10,month_11,month_12,year_2013,year_2014,year_2015,year_2016,year_2017,year_2018
1611,119,73486,9,1422583273,2015-01-30 02:01:13,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2745,169,73486,9,1364496320,2013-03-28 18:45:20,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2800,187,73486,7,1380913548,2013-10-04 19:05:48,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
3725,265,73486,10,1435715278,2015-07-01 01:47:58,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3852,272,73486,10,1362125755,2013-03-01 08:15:55,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
707097,53588,73486,10,1468646231,2016-07-16 05:17:11,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
708390,53656,73486,10,1451949786,2016-01-04 23:23:06,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
708571,53695,73486,8,1436313055,2015-07-07 23:50:55,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
710711,53871,73486,9,1446208671,2015-10-30 12:37:51,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0


In [11]:
# create a data subset
movies_subset = [73486, 75314, 68646, 99685]
reviews_subset = reviews[reviews['movie_id'].isin(movies_subset)]

# create user-by-item matrix: one row per user, one column per movie,
# NaN wherever the user has not rated the movie
user_items_subset = reviews_subset[['user_id', 'movie_id', 'rating', 'timestamp']]
user_by_movie_subset = user_items_subset.groupby(['user_id', 'movie_id'])['rating'].max().unstack()

# use a plain ndarray rather than np.matrix: NumPy no longer recommends the
# matrix class, and the training matrix built later is an ndarray as well
ratings_mat = np.array(user_by_movie_subset)
print(ratings_mat)

[[10. nan nan nan]
 [10. nan nan nan]
 [ 9. nan nan nan]
 ...
 [nan nan 10. 10.]
 [nan  9. nan nan]
 [nan  8. nan nan]]


In [21]:
# scratch check: prints only when the entry at row 0, column 1 is an
# actual rating (i.e. not NaN)
if not np.isnan(ratings_mat[0, 1]):
    print('fk')

In [22]:
# small hand-made matrices for experimenting with slicing and dot products
# u is 4 x 2
u = np.array([[2, 5], [8, 4], [9, 6], [4, 8]])

# v is 2 x 4
v = np.array([[-2, 4, 8, 3], [4, 7, -1, -5]])

In [25]:
# column 0 of v (all rows) -- the same slicing FunkSVD applies to
# movie_mat[:, movie] when it takes a dot product
v[:, 0]

array([-2,  4])

16

In [33]:
def FunkSVD(ratings_mat, latent_features=4, learning_rate=0.0001, iters=100, random_state=None):
    ''' This function performs matrix factorization using a basic form of
        FunkSVD with no regularization

        Only observed (non-NaN) ratings contribute to the error and to the
        gradient updates. A header and one line per iteration (iteration
        number and mean squared error over the observed ratings) are printed.

        Args:
            ratings_mat - (numpy array or matrix) a matrix with users as rows,
                movies as columns, and ratings as values; missing ratings
                must be NaN
            latent_features - (int) number of latent features used
            learning_rate - (float) learning rate
            iters - (int) the number of iterations
            random_state - (int or None) optional seed for the random initial
                matrices; None (default) draws from numpy's global random
                state exactly as before

        Returns:
            user_mat - (numpy array) a user by latent feature matrix
            movie_mat - (numpy array) a latent feature by movie matrix

    '''

    # work on a float ndarray so np.matrix and integer inputs behave the same
    ratings = np.asarray(ratings_mat, dtype=float)
    n_users, n_movies = ratings.shape

    # a rating "exists" when it is not NaN; the same mask drives both the
    # training loop and the MSE denominator so the two can never disagree
    observed = ~np.isnan(ratings)
    num_ratings = int(np.count_nonzero(observed))

    # (user, movie) index pairs in row-major order, computed once instead of
    # re-testing every cell of the matrix on every iteration
    observed_pairs = np.argwhere(observed)

    # initialize the user and movie matrices with random values
    if random_state is None:
        user_mat = np.random.rand(n_users, latent_features)
        movie_mat = np.random.rand(latent_features, n_movies)
    else:
        rng = np.random.RandomState(random_state)
        user_mat = rng.rand(n_users, latent_features)
        movie_mat = rng.rand(latent_features, n_movies)

    # header for running results
    print('Optimization Statistics')
    print('Iteration\t| Mean Squared Error')

    for i in range(iters):

        # sum of squared errors accumulated over this pass
        sse_accum = 0.0

        for user, movie in observed_pairs:
            # snapshot the user's factors: the movie update below must use
            # the values the error was computed from, not the updated ones
            user_vec = user_mat[user, :].copy()
            movie_vec = movie_mat[:, movie].copy()

            # error is the actual rating minus the dot product of the
            # user and movie latent features
            error = ratings[user, movie] - np.dot(user_vec, movie_vec)
            sse_accum += error ** 2

            # step both factor vectors along the negative gradient of error**2
            user_mat[user, :] += learning_rate * (2 * error * movie_vec)
            movie_mat[:, movie] += learning_rate * (2 * error * user_vec)

        # report nan rather than dividing by zero when nothing is observed
        mse = sse_accum / num_ratings if num_ratings else float('nan')
        print('{} \t\t| {}'.format(i + 1, float(mse)))

    return user_mat, movie_mat

In [34]:
# fit the subset matrix: 4 latent features, learning rate 0.005, 10 iterations
user_mat, movie_mat = FunkSVD(
    ratings_mat,
    latent_features=4,
    learning_rate=0.005,
    iters=10,
)

Optimization Statistics
Iteration	| Mean Squared Error
0 		| 23058.167675767836
1 		| 4173.314319681984
2 		| 1805.9639073266296
3 		| 1453.931495108896
4 		| 1372.9109567147755
5 		| 1334.4292625548055
6 		| 1299.8137685780673
7 		| 1261.0791193831838
8 		| 1215.9270990944121
9 		| 1162.8414811993057


In [35]:
# show the reconstructed (predicted) ratings first, then the actual ratings
print(user_mat.dot(movie_mat), ratings_mat, sep='\n')

[[10.05646676  9.78137667  8.9940508   9.63271309]
 [10.11664968  9.08808264  8.76236892  9.16145548]
 [ 8.9642857   9.67142134  8.28844561  9.51845913]
 ...
 [10.73908586 10.35693212  9.27246528 10.1971345 ]
 [ 9.63192249  9.00359679  8.52080901  8.99254339]
 [ 8.1634822   8.0140148   6.89819949  7.67280614]]
[[10. nan nan nan]
 [10. nan nan nan]
 [ 9. nan nan nan]
 ...
 [nan nan 10. 10.]
 [nan  9. nan nan]
 [nan  8. nan nan]]


In [36]:
# fit the subset matrix again: 4 latent features, learning rate 0.005, 250 iterations
user_mat, movie_mat = FunkSVD(
    ratings_mat,
    latent_features=4,
    learning_rate=0.005,
    iters=250,
)

Optimization Statistics
Iteration	| Mean Squared Error
0 		| 22121.4801720508
1 		| 4372.177877978378
2 		| 1796.8032360496582
3 		| 1432.652967269315
4 		| 1342.9669386750581
5 		| 1295.92124773883
6 		| 1254.9788389622618
7 		| 1212.617525153679
8 		| 1166.3207640055355
9 		| 1114.2045264091325
10 		| 1054.5758352951939
11 		| 986.6425427185067
12 		| 911.1192898214205
13 		| 830.3341323261498
14 		| 747.788226585361
15 		| 667.2381837338015
16 		| 591.6430582973791
17 		| 522.5128535823117
18 		| 459.9748389062682
19 		| 403.37843692179825
20 		| 351.9663335877322
21 		| 305.25812321408114
22 		| 263.0931837137818
23 		| 225.4754402857452
24 		| 192.38727137605153
25 		| 163.6728478929619
26 		| 139.0143807568636
27 		| 117.97559877419994
28 		| 100.07117208356341
29 		| 84.82897136820378
30 		| 71.8293227698986
31 		| 60.720292696551226
32 		| 51.21584453100679
33 		| 43.085233317754025
34 		| 36.140122744811215
35 		| 30.223143141238758
36 		| 25.19932085451331
37 		| 20.950397359

In [37]:
# show the reconstructed (predicted) ratings first, then the actual ratings
print(user_mat.dot(movie_mat), ratings_mat, sep='\n')

[[10.         11.27287929  8.71504627 10.615213  ]
 [10.          7.30007911  7.94959479  7.89495745]
 [ 9.          9.8434353   8.98883418  9.14350654]
 ...
 [12.88551186 10.2529267  10.         10.        ]
 [ 9.06249771  9.          6.17498712  7.4329903 ]
 [ 7.97175245  8.          5.6628727   7.52132867]]
[[10. nan nan nan]
 [10. nan nan nan]
 [ 9. nan nan nan]
 ...
 [nan nan 10. 10.]
 [nan  9. nan nan]
 [nan  8. nan nan]]


In [38]:
def create_train_test(reviews, order_by, train_size, test_size):
    '''
        Args:
            reviews - (pandas df) dataframe to split into train and test
            order_by - (string) column name to sort by
            train_size - (int) number of rows in training set
            test_size - (int) number of columns in test set
            
        Returns:
            train_df - (pandas df) dataframe of the training_set
            validation_df - (pandas df) dataframe of test set
    '''
    
    ordered_reviews = reviews.sort_values(by=order_by)
    train_df = ordered_reviews.iloc[:train_size, :]
    validation_df = ordered_reviews.iloc[train_size:(train_size + test_size), :]
    
    return train_df, validation_df

In [39]:
# the 8000 earliest reviews (by date) train the model; the next 2000 validate it
train_df, val_df = create_train_test(
    reviews,
    order_by='date',
    train_size=8000,
    test_size=2000,
)

In [40]:
# build the training user-by-item matrix: rows are users, columns are movies,
# values are the highest rating recorded for each user-movie pair
train_user_item = train_df.loc[:, ['user_id', 'movie_id', 'rating', 'timestamp']]
train_data_df = (
    train_user_item
    .groupby(['user_id', 'movie_id'])['rating']
    .max()
    .unstack()
)
train_data_np = np.array(train_data_df)

In [41]:
# fit FunkSVD to the training data: 15 latent features, learning rate 0.005, 250 iterations
user_mat, movie_mat = FunkSVD(
    train_data_np,
    latent_features=15,
    learning_rate=0.005,
    iters=250,
)

Optimization Statistics
Iteration	| Mean Squared Error
0 		| 84835.85481686002
1 		| 47100.79770481531
2 		| 32888.16970623109
3 		| 24653.90760824317
4 		| 19250.26061749317
5 		| 15427.124436843074
6 		| 12588.993232335755
7 		| 10413.108276263278
8 		| 8706.048620213547
9 		| 7343.221043942525
10 		| 6240.0924892571875
11 		| 5336.995521567193
12 		| 4590.468182979384
13 		| 3968.069105096982
14 		| 3445.1494826661037
15 		| 3002.7488822475743
16 		| 2626.1467619903883
17 		| 2303.8158991820524
18 		| 2026.644517401264
19 		| 1787.3535988804083
20 		| 1580.0620164133854
21 		| 1399.9639364128268
22 		| 1243.0902104964637
23 		| 1106.1315139288715
24 		| 986.3063090671944
25 		| 881.2611450046019
26 		| 788.994271344716
27 		| 707.7961342289292
28 		| 636.2022048856319
29 		| 572.9549364564224
30 		| 516.972595918915
31 		| 467.32338312536785
32 		| 423.2037089196623
33 		| 383.91981867510026
34 		| 348.8721606141619
35 		| 317.5420423432784
36 		| 289.480217395422
37 		| 264.2971123

In [42]:
def predict_rating(user_matrix, movie_matrix, user_id, movie_id, ratings_df=None):
    '''
        Predict the rating a user would give a movie from the FunkSVD factors

        Args:
            user_matrix - (numpy array) user by latent factor matrix
            movie_matrix - (numpy array) latent factor by movie matrix
            user_id - (int) the user_id from the reviews df
            movie_id - (int) the movie_id according to the movies df
            ratings_df - (pandas df) optional user-by-movie dataframe whose
                index (user ids) and columns (movie ids) give the row and
                column positions used in the two matrices; defaults to the
                notebook-level train_data_df

        Returns:
            pred - the predicted rating for user_id-movie_id according to FunkSVD

        Raises:
            IndexError - if user_id or movie_id does not appear in ratings_df
                (no latent factors were learned for it)
    '''

    # fall back to the notebook's training frame when none is passed in
    if ratings_df is None:
        ratings_df = train_data_df

    # users and movies in the same order as the matrix rows / columns
    # (assumes the matrices were fit on np.array(ratings_df))
    user_ids_series = np.array(ratings_df.index)
    movie_ids_series = np.array(ratings_df.columns)

    # positions of the requested user and movie
    user_rows = np.where(user_ids_series == user_id)[0]
    movie_cols = np.where(movie_ids_series == movie_id)[0]

    # fail with a message that names the missing id instead of the opaque
    # "index 0 is out of bounds" raised by indexing an empty match
    if user_rows.size == 0:
        raise IndexError('user_id {} is not in the training data'.format(user_id))
    if movie_cols.size == 0:
        raise IndexError('movie_id {} is not in the training data'.format(movie_id))

    user_row = user_rows[0]
    movie_col = movie_cols[0]

    # take dot product of that row and column in U and V to make prediction
    pred = np.dot(user_matrix[user_row, :], movie_matrix[:, movie_col])

    return pred

In [43]:
# try the function on the first user-movie cell of the user-movie matrix
# (a cell that holds a nan, i.e. no actual rating)
pred_val = predict_rating(user_mat, movie_mat, user_id=8, movie_id=2844)
pred_val

6.641095395643654

In [48]:
def print_prediction_summary(user_id, movie_id, prediction, movies_df=None):
    '''
        Print a one-line summary of a predicted rating

        Args:
            user_id - (int) the user_id from the reviews_df
            movie_id - (int) the movie_id according the movies df
            prediction - (float) predicted rating for user_id-movie_id
            movies_df - (pandas df) optional dataframe with 'movie_id' and
                'movie' columns used to look up the title; defaults to the
                notebook-level movies dataframe

        Returns:
            None - prints a statement about the user, movie, and prediction made
    '''

    # fall back to the notebook's movies table when none is passed in
    if movies_df is None:
        movies_df = movies

    # read the title value itself instead of stringifying the whole Series,
    # which leaked the row index into the message and truncated long titles
    titles = movies_df.loc[movies_df['movie_id'] == movie_id, 'movie']
    if len(titles) > 0:
        movie_name = str(titles.iloc[0])
    else:
        # movie_id is absent from the movies table
        movie_name = '<unknown title>'

    print("For user {} we predict a {} rating for the movie {}".format(user_id, prediction, movie_name))

In [49]:
# try the summary printer on the prediction made above
print_prediction_summary(user_id=8, movie_id=2844, prediction=pred_val)

For user 8 we predict a 6.641095395643654 rating for the movie 15    Fantômas - À l'ombre de la guillotine (1913)


In [51]:
def validation_comparison(val_df, num_preds):
    '''
        Print actual vs. predicted ratings for the first rows of the
        validation set, using the notebook-level user_mat and movie_mat

        Args:
            val_df - (pandas dataframe) the validation dataset for our model
            num_preds - (int) the number of rows (going in order) you would like to make predictions for;
                values larger than the number of rows in val_df are capped

        Returns:
            None - prints a statement about the prediction made for each row of val_df
                from row 0 to num_preds; rows whose user or movie is not in the
                training data get a "no prediction" statement instead
    '''

    val_users = np.array(val_df['user_id'])
    val_movies = np.array(val_df['movie_id'])
    val_ratings = np.array(val_df['rating'])

    # never index past the end of the validation set
    rows_to_show = min(num_preds, len(val_users))

    for idx in range(rows_to_show):
        try:
            pred = predict_rating(user_mat, movie_mat, val_users[idx], val_movies[idx])
        except IndexError:
            # predict_rating raises IndexError when the user or the movie has
            # no row/column in the training matrix; report it and move on
            # rather than aborting the remaining comparisons
            print("No prediction possible for user {} on movie {}: user or movie not in the training data.".format(val_users[idx], val_movies[idx]))
            continue
        print("The actual rating for user {} on movie {} is {}.\nWhile the predicted rating is {}.".format(val_users[idx], val_movies[idx], val_ratings[idx], pred))
    
# compare predicted and actual ratings for the first 6 validation rows
validation_comparison(val_df, num_preds=6)

The actual rating for user 49056 on movie 1598822 is 8.
While the predicted rating is 7.384637950921652.
The actual rating for user 49056 on movie 289879 is 9.
While the predicted rating is 9.140269044599378.
The actual rating for user 49056 on movie 1563738 is 9.
While the predicted rating is 8.946170801803827.
The actual rating for user 49056 on movie 1458175 is 4.
While the predicted rating is 7.445258334685359.
The actual rating for user 28599 on movie 103639 is 8.
While the predicted rating is 7.320750184681668.
The actual rating for user 50593 on movie 1560985 is 4.
While the predicted rating is 4.061766555971241.
