In [1]:
import pandas as pd
import os
import webbrowser
#View data
data_table = pd.read_csv('movie_ratings_data_set.csv')
#Create a web page view of the data for easy viewing
html = data_table[0:500].to_html()
#Save the html to a temporary file
with open('data.html', 'w') as f:
    f.write(html)
#Open the web page in web browser
full_filename = os.path.abspath('data.html')
#webbrowser.open("file://{}".format(full_filename))

In [2]:
data_table.shape

(680, 3)

In [3]:
data_table.head()

Unnamed: 0,user_id,movie_id,value
0,1,28,4
1,1,26,4
2,1,9,4
3,1,1,4
4,1,14,4


In [4]:
#Movie titles and genres
data_movie = pd.read_csv('movies.csv',index_col='movie_id')
#Create a web page view of the data for easy viewing
html = data_movie[0:500].to_html()
#Save the html to a temporary file
with open('data.html', 'w') as f:
    f.write(html)
#Open the web page in web browser
full_filename = os.path.abspath('data.html')
#webbrowser.open("file://{}".format(full_filename))

In [5]:
data_movie.head()

Unnamed: 0_level_0,title,genre
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,The Sheriff 1,"crime drama, western"
2,The Big City Judge 1,legal drama
3,The Sheriff 2,"crime drama, western"
4,Just a Regular Family,reality
5,The Big City Judge 2,legal drama


In [6]:
#Create movie rating matrix
import numpy as np
df = pd.read_csv('movie_ratings_data_set.csv')
ratings_df = pd.pivot_table(df, index='user_id', columns='movie_id',aggfunc=np.max)
html = ratings_df.to_html(na_rep="")
with open('review_matrix.html','w') as f:
    f.write(html)
#Open the web page in web browser
full_filename = os.path.abspath('review_matrix.html')
#webbrowser.open("file://{}".format(full_filename))

In [7]:
#Save the review matrix as csv
ratings_df.to_csv('review_matrix.csv', na_rep ='')

In [8]:
#Define functions for further usage
import numpy as np
from scipy.optimize import fmin_cg


def normalize_ratings(ratings):
    """
    Given an array of user ratings, subtract the mean of each product's ratings
    :param ratings: 2d array of user ratings
    :return: (normalized ratings array, the calculated means)
    """
    mean_ratings = np.nanmean(ratings, axis=0)
    return ratings - mean_ratings, mean_ratings


def cost(X, *args):
    """
    Cost function for low rank matrix factorization
    :param X: The matrices being factored (P and Q) rolled up as a contiguous array
    :param args: Array containing (num_users, num_products, num_features, ratings, mask, regularization_amount)
    :return: The cost with the current P and Q matrices
    """
    num_users, num_products, num_features, ratings, mask, regularization_amount = args

    # Unroll P and Q
    P = X[0:(num_users * num_features)].reshape(num_users, num_features)
    Q = X[(num_users * num_features):].reshape(num_products, num_features)
    Q = Q.T

    # Calculate current cost
    return (np.sum(np.square(mask * (np.dot(P, Q) - ratings))) / 2) + ((regularization_amount / 2.0) * np.sum(np.square(Q.T))) + ((regularization_amount / 2.0) * np.sum(np.square(P)))


def gradient(X, *args):
    """
    Calculate the cost gradients with the current P and Q.
    :param X: The matrices being factored (P and Q) rolled up as a contiguous array
    :param args: Array containing (num_users, num_products, num_features, ratings, mask, regularization_amount)
    :return: The gradient with the current X
    """
    num_users, num_products, num_features, ratings, mask, regularization_amount = args

    # Unroll P and Q
    P = X[0:(num_users * num_features)].reshape(num_users, num_features)
    Q = X[(num_users * num_features):].reshape(num_products, num_features)
    Q = Q.T

    # Calculate the current gradients for both P and Q
    P_grad = np.dot((mask * (np.dot(P, Q) - ratings)), Q.T) + (regularization_amount * P)
    Q_grad = np.dot((mask * (np.dot(P, Q) - ratings)).T, P) + (regularization_amount * Q.T)

    # Return the gradients as one rolled-up array as expected by fmin_cg
    return np.append(P_grad.ravel(), Q_grad.ravel())


def low_rank_matrix_factorization(ratings, mask=None, num_features=15, regularization_amount=0.01):
    """
    Factor a ratings array into two latent feature arrays (user features and product features)

    :param ratings: Matrix with user ratings to factor
    :param mask: A binary mask of which ratings are present in the ratings array to factor
    :param num_features: Number of latent features to generate for users and products
    :param regularization_amount: How much regularization to apply
    :return: (P, Q) - the factored latent feature arrays
    """
    num_users, num_products = ratings.shape

    # If no mask is provided, consider all 'NaN' elements as missing and create a mask.
    if mask is None:
        mask = np.invert(np.isnan(ratings))

    # Replace NaN values with zero
    ratings = np.nan_to_num(ratings)

    # Create P and Q and fill with random numbers to start
    np.random.seed(0)
    P = np.random.randn(num_users, num_features)
    Q = np.random.randn(num_products, num_features)

    # Roll up P and Q into a contiguous array as fmin_cg expects
    initial = np.append(P.ravel(), Q.ravel())

    # Create an args array as fmin_cg expects
    args = (num_users, num_products, num_features, ratings, mask, regularization_amount)

    # Call fmin_cg to minimize the cost function and this find the best values for P and Q
    X = fmin_cg(cost, initial, fprime=gradient, args=args, maxiter=3000)

    # Unroll the new P and new Q arrays out of the contiguous array returned by fmin_cg
    nP = X[0:(num_users * num_features)].reshape(num_users, num_features)
    nQ = X[(num_users * num_features):].reshape(num_products, num_features)

    return nP, nQ.T


def RMSE(real, predicted):
    """
    Calculate the root mean squared error between a matrix of real ratings and predicted ratings
    :param real: A matrix containing the real ratings (with 'NaN' for any missing elements)
    :param predicted: A matrix of predictions
    :return: The RMSE as a float
    """
    return np.sqrt(np.nanmean(np.square(real - predicted)))

In [9]:
ratings_df = pd.pivot_table(df, index='user_id', columns='movie_id',aggfunc=np.max)
#Apply matrix factorization to find the latent features
U, M = low_rank_matrix_factorization(ratings_df.as_matrix(), num_features=15, regularization_amount = 0.1)

  This is separate from the ipykernel package so we can avoid doing imports until


         Current function value: 32.504384
         Iterations: 3000
         Function evaluations: 4478
         Gradient evaluations: 4478


In [10]:
#Find all predicted ratings by multiplying the U by M
predicted_ratings = np.matmul(U,M)

In [11]:
import pickle
#Save features and predicted ratings to files for later use
pickle.dump(U, open("user_features.dat", "wb"))
pickle.dump(M, open("product_features.dat", "wb"))
pickle.dump(predicted_ratings, open("predicted_ratings.dat", "wb" ))

In [12]:
# Load prediction rules from data files
U = pickle.load(open('user_features.dat','rb'))
M = pickle.load(open('product_features.dat', 'rb'))
predicted_ratings = pickle.load(open('predicted_ratings.dat','rb'))

In [13]:
#Save all the ratings to a csv file
predicted_ratings_df = pd.DataFrame(index=ratings_df.index, columns=ratings_df.columns, data=predicted_ratings)
predicted_ratings_df.to_csv('predicted_rating.csv')

In [14]:
# Find similar products
M=np.transpose(M)

In [15]:
#Choose a movie to find similar movies to. 
movie_id = 11

In [16]:
#Get movie #1's name and genre
movie_information = data_movie.loc[movie_id]

In [17]:
movie_information

title           Inspector Jackson
genre    detective drama, mystery
Name: 11, dtype: object

In [18]:
#Get the features for movie #1 we found via matrix factorization
current_movie_feature = M[movie_id-1]
current_movie_feature

array([ 7.23185567e-01,  8.06454212e-03, -7.14222029e-01, -3.58564459e-01,
       -9.85401253e-01, -1.83175760e+00, -1.97484877e-01, -3.87235907e-02,
        5.82322997e-01,  5.64558472e-05, -1.79591901e-02, -4.28468038e-01,
       -4.11725500e-01, -7.64708933e-01,  4.63513329e-01])

In [19]:
#The main logic for finding similar movies
#1. Subtract the current movie's features from every other movie's features
difference = M - current_movie_feature

#2. Take the absolute value of that difference (so all numbsers are positive)
absolute_diffenrence = np.abs(difference)

#3. Each movie has 15 features. Sum those 15 features to get a total 'diffenrence score' for each movie
total_difference = np.sum(absolute_diffenrence, axis=1)

#4. Create a new column in the movie list with the diffenrece score for each movie
data_movie['difference_score'] = total_difference

#5. Sort the movie list by difference score, from least to most
sorted_movie_list = data_movie.sort_values('difference_score')

#6. Print the result, show the 5 most similar movies to movie #1
print("The five most similar movies are:\n", sorted_movie_list[['title','difference_score']][:5])


The five most similar movies are:
                             title  difference_score
movie_id                                           
11              Inspector Jackson          0.000000
26              Mafia Underground          3.417996
28                  The Sheriff 4          3.461559
2            The Big City Judge 1          4.167416
10        Surrounded by Zombies 1          4.310469


In [20]:
# Make recommendations
print('Enter a user_id to get recommendations (Between 1 and 100):')
user_id_to_search = int(input())

Enter a user_id to get recommendations (Between 1 and 100):
70


In [21]:
print("Movies previously reviewed by user_id {}: ".format(user_id_to_search))

Movies previously reviewed by user_id 70: 


In [22]:
reviewed_movies_df = df[df['user_id'] == user_id_to_search]
reviewed_movies_df = reviewed_movies_df.join(data_movie, on='movie_id')

In [23]:
reviewed_movies_df

Unnamed: 0,user_id,movie_id,value,title,genre,difference_score
469,70,17,3,Singing Telegram,"musical, comedy",9.204921
470,70,18,4,Bad Teachers,comedy,8.426812
471,70,21,4,Political Gaffs,"comedy, political satire",8.531738
472,70,34,5,The Serious Detective,detective drama,8.057612
473,70,19,5,Fake News about Fake News,"satire, comedy",7.480958
474,70,30,5,Post-Apocalyptia 2,"sci-fi, thriller, mystery",9.520933
475,70,33,3,Sports Nerds,comedy,10.313175
476,70,20,4,Buy My App,comedy,8.76021
477,70,32,4,Behind the Scenes,comedy-drama,10.238354
478,70,31,4,My Complicated Family,comedy-drama,10.85626


In [24]:
input('Press enter to continue: ')
print('Movies we will recommend:')
user_ratings = predicted_ratings[user_id_to_search - 1]
data_movie['rating'] = user_ratings

Press enter to continue: 8
Movies we will recommend:


In [25]:
already_reviewed = reviewed_movies_df['movie_id']
recommended_df = data_movie[data_movie.index.isin(already_reviewed) == False]
recommended_df = recommended_df.sort_values(by = ['rating'], ascending = False)
print(recommended_df[['title', 'genre', 'rating']].head(5))

                               title                  genre    rating
movie_id                                                             
13                     The Sheriff 3   crime drama, western  5.179073
6                  Attack on Earth 1         sci-fi, action  4.537884
2               The Big City Judge 1            legal drama  4.493388
15        We Will Fight Those Aliens         sci-fi, action  4.280675
8           Sci-Fi Murder Detectives  supernatural, mystery  4.264366


In [26]:
#Evaluate the model
#Load user ratings for training and testing
raw_training_dataset_df = pd.read_csv('movie_ratings_data_set_training.csv')
raw_testing_dataset_df = pd.read_csv('movie_ratings_data_set_testing.csv')

In [27]:
#Convert the running list of user ratings into a matrix
ratings_training_df = pd.pivot_table(raw_training_dataset_df, index = 'user_id', columns = 'movie_id', aggfunc=np.max)
ratings_testing_df = pd.pivot_table(raw_testing_dataset_df, index = 'user_id', columns = 'movie_id', aggfunc=np.max)

In [28]:
#Apply matrix factorization to find the latent features
U, M = low_rank_matrix_factorization(ratings_training_df.as_matrix(), num_features=15, regularization_amount = 1.1)

  


Optimization terminated successfully.
         Current function value: 315.538580
         Iterations: 697
         Function evaluations: 1061
         Gradient evaluations: 1061


In [29]:
predicted_ratings = np.matmul(U,M)

In [30]:
#Measure RMSE
rmse_training = RMSE(ratings_training_df.as_matrix(), predicted_ratings)
rmse_testing = RMSE(ratings_testing_df.as_matrix(), predicted_ratings)
print('Trainning RMSE: {}'.format(rmse_training))
print('Testing RMSE: {}'.format(rmse_testing))


Trainning RMSE: 0.2495257852238146
Testing RMSE: 1.2096525213412235


  
  This is separate from the ipykernel package so we can avoid doing imports until


We can increase the regularization term to mitigate overfitting, but the tradeoff will be the training RMSE may increase. In this example, the best way is to increase training dataset, users' ratings.

In [32]:
#Handle new users
#Normalize the ratings 
normalized_ratings, means = normalize_ratings(ratings_df.as_matrix())
#Apply matrix factorization to find the latent features
U, M = low_rank_matrix_factorization(normalized_ratings, num_features=15, regularization_amount = 0.1)

  This is separate from the ipykernel package so we can avoid doing imports until


Optimization terminated successfully.
         Current function value: 10.943926
         Iterations: 1877
         Function evaluations: 2787
         Gradient evaluations: 2787


In [33]:
#Find all predicted ratings by multiplying U and M
predicted_ratings = np.matmul(U,M)
#Save a copy of the means 
pickle.dump(means, open('means.dat','wb'))

In [36]:
#Predict movies for new users
#Load prediction rules from data files
mean = pickle.load(open('means.dat','rb'))
#Use the average movie ratings directly as the user's predicted ratings
user_rating = means
print('Movies we will recommend:')
data_movie['rating'] = user_ratings
data_movie = data_movie.sort_values(by=['rating'], ascending = False)
print(data_movie[['title','genre','rating']].head(5))

Movies we will recommend:
                              title                      genre    rating
movie_id                                                                
13                    The Sheriff 3       crime drama, western  5.179073
34            The Serious Detective            detective drama  4.995147
30               Post-Apocalyptia 2  sci-fi, thriller, mystery  4.973627
19        Fake News about Fake News             satire, comedy  4.955316
6                 Attack on Earth 1             sci-fi, action  4.537884


The idea is to recommend the most popular movies to new users.