<a href="https://colab.research.google.com/github/ethanstykes/ncf/blob/master/Neural_Collaborative_Filtering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Neural Collaborative Filtering

Neural network based collaborative filtering for recommending new products by analyzing feedbacks from users. Intended to be utilized in areas including movies, music, news, books, and products in general. In this project, I demonstrate movie recommandation using the Netflix Prize dataset, learning from implcit feedbacks.

In [0]:
import numpy as np
import os
import tensorflow as tf
import time

  return f(*args, **kwds)


### Extracting data from files to create a user-movie matrix
The datasets contain over 100 million ratings from 480 thousand
randomly-chosen, anonymous Netflix customers over 17 thousand movie titles.

The "training_set" directory contains 17770 files, one
per movie.  The first line of each file contains the movie id followed by a
colon.  Each subsequent line in the file corresponds to a rating from a customer
and its date in the following format:

CustomerID,Rating,Date

- MovieIDs range from 1 to 17770 sequentially.
- CustomerIDs range from 1 to 2649429, with gaps. There are 480189 users.
- Ratings are on a five star (integral) scale from 1 to 5.
- Dates have the format YYYY-MM-DD.

We ignore the dates and extract user id's and corresponding movie ratings to form a user-movie matrix.

In [0]:
tic = time.time()
num_movies = 17770
num_user_ids = 2649429 
num_users = 480189
user_movies = np.zeros((num_users, num_movies))
user_dict = {} # user_id -> user_row
movie_ids = []

user_count = 0
file_count = 0

for filename in os.listdir("dataset/training_set/"):
    movie_file = open("dataset/training_set/"+filename)
    movie_data = movie_file.read().split("\n")
    movie_id = int(movie_data[0].strip(":"))
    movie_ids.append(movie_id)
    #print(movie_id)
    for i in range(1, len(movie_data) -1 ):
        user_rating = movie_data[i].split(",")
        user_id = user_rating[0]
        rating = user_rating[1]
        #print(user_id)
        if user_id not in user_dict:
            user_dict[user_id] = user_count
            user_movies[user_count, movie_id - 1] = rating
            user_count += 1
        else:
            user_movies[user_dict[user_id], movie_id - 1] = rating
    if movie_id > 17770: 
        break
    if file_count%1000 == 0:
        print("Files loaded:", file_count)
    file_count+=1
    
toc = time.time()
print("time elapsed:",(toc - tic))
print("number of users:", user_count)

Files loaded: 0
Files loaded: 1000
Files loaded: 2000
Files loaded: 3000
Files loaded: 4000
Files loaded: 5000
Files loaded: 6000
Files loaded: 7000
Files loaded: 8000
Files loaded: 9000
Files loaded: 10000
Files loaded: 11000
Files loaded: 12000
Files loaded: 13000
Files loaded: 14000
Files loaded: 15000
Files loaded: 16000
Files loaded: 17000
time elapsed: 745.8722639083862
number of users: 480189


In [0]:
#analyze the data
user_id = 1488844
movie_id = 1

print(user_movies[user_dict[str(user_id)], movie_id - 1])
j=0
for rating in user_movies[user_dict["1956732"]]:
    if rating>0:
        j+=1
print(j)

3.0
167


### Processing the user-movie matrix to create an input matrix with sparse vectors as rows 

Each row of the input matrix will contain a concatenation of feature vectors of users and movies. Corresponding ratings are stored in a different vector.

In [0]:
tic = time.time()

user_movies_train_users = user_movies[:10000]
#print(user_movies_train_users.shape)
#user_movies_test = user_movies[336132:]

nonzero_indices = np.nonzero(user_movies_train_users)
nonzero_indices = np.array([nonzero_indices[0], nonzero_indices[1]]) #do the shuffle after this

#shuffle
np.random.shuffle(nonzero_indices.T)
count_nonzero_indices = 100
nonzero_indices = nonzero_indices[:, :count_nonzero_indices]

users = nonzero_indices[0]
movies = nonzero_indices[1]

print("Number of ratings:", count_nonzero_indices)
user_movies_train = np.zeros((count_nonzero_indices, num_users + num_movies))
#print(user_movies_train.shape)
ratings = np.zeros((count_nonzero_indices))
#print(ratings.shape)
#user_movies_log = np.empty((count_nonzero_indices, 2))
#user_movies_train[0] = np.ones((num_users + num_movies, 1))

for i in range(count_nonzero_indices):
    rating = user_movies_train_users[users[i], movies[i]]
    ratings[i] = rating
    user_vector = np.expand_dims(user_movies[users[i]], axis =1)
    #print(user_vector.shape)
    movie_vector = np.expand_dims(user_movies[:, movies[i]] , axis=1)
    #print(movie_vector.shape)
    user_movies_train[i] = np.concatenate((user_vector, movie_vector), axis=0)[:,0]
    #user_movies_log[i][0], user_movies_log[i][1] = (users[i], movies[i])
    if(i%100 == 0):
        print("completed:", i)
    
print(ratings)
toc = time.time()
print("time elapsed:",(toc - tic))

Number of ratings: 100
completed: 0
[3. 4. 5. 4. 3. 1. 4. 3. 4. 4. 2. 4. 2. 3. 3. 5. 4. 4. 5. 1. 4. 5. 3. 3.
 3. 4. 2. 4. 5. 1. 5. 4. 3. 4. 3. 1. 5. 4. 3. 4. 4. 2. 4. 3. 5. 4. 1. 3.
 2. 3. 3. 3. 4. 3. 5. 3. 5. 1. 4. 4. 4. 5. 3. 5. 4. 5. 1. 3. 2. 3. 5. 2.
 4. 5. 3. 4. 4. 3. 3. 3. 3. 3. 2. 3. 5. 4. 4. 4. 5. 3. 5. 4. 3. 3. 5. 2.
 4. 5. 4. 3.]
time elapsed: 230.9734058380127


In [0]:
#print(np.count_nonzero(user_movies_train[:,0]))
#print(user_movies_log[1])
print(nonzero_indices)
#qqqqq = nonzero_indices[0][0]
#print(user_dict[str(qqqqq)]) #do the reverse instead

[[ 6014  2558  5395  8807   616  7258  2801  3978  1027  4851  6730  2156
   8613   130  6375  9606  7576  8064  1918  2040  3104  7255   182  3283
   9822  6411  2177  5413  1985  8703  8936  1324  5189  6930  3569  2024
    475  9201   213  5148  4954  4964  5675   543   336   631  3322    96
   2466  2301  7895  1383  8360  1149  1928  2764  6407    73   650  3293
    861  2559  6688   340  1802  8692  1552  8897  8870  3322  6540  9029
   7618  4753  4453   962  4762   164  9450  2119  8962  2071  3737  3449
   3492  1301  2421  8976  9071  3329  6261  2592  2421  2416  2573    92
   1328  7473   183  1581]
 [ 1831 11278  4632  7816 14730  8595  4989 12842 16533 12434 11638  4298
  14617 14046 17096 13808 12014 15199 13613 11254  8727  8467    45 13794
   3112  4419 10893 14049  7033  7009 14868 10995 10175  6427  5205 12416
   5774  7232  5632 13301  8650  6608 14311  6971 12842 15208  5312  8049
   9420  5938  7612  5961 15840 12458  3961   482  7232 12209  6165  4224
   3637   4

## Building the model

In [0]:
def create_placeholders(n_x, n_y, m):
    
    x = tf.placeholder(tf.float32, [n_x,m])#497959
    y = tf.placeholder(tf.float32, [n_y,m])
    
    return x,y

def initialize_parameters(n_x, n_y):
    
    W1 = tf.get_variable("W1", [25, n_x], initializer = tf.contrib.layers.xavier_initializer(seed=1))
    b1 = tf.get_variable("b1", [25,1], initializer = tf.zeros_initializer())
    W2 = tf.get_variable("W2", [12, 25], initializer = tf.contrib.layers.xavier_initializer(seed=1))
    b2 = tf.get_variable("b2", [12,1], initializer = tf.zeros_initializer())
    W3 = tf.get_variable("W3", [n_y, 12], initializer = tf.contrib.layers.xavier_initializer(seed=1))
    b3 = tf.get_variable("b3", [n_y,1], initializer = tf.zeros_initializer())
    
    parameters = {"W1": W1,
                  "b1": b1,
                  "W2": W2,
                  "b2": b2,
                  "W3": W3,
                  "b3": b3}
    
    return parameters

def forward_propagation(x, parameters): 
    
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    W3 = parameters["W3"]
    b3 = parameters["b3"]
    
    Z1 = tf.add(tf.matmul(W1, x), b1)
    A1 = tf.nn.relu(Z1)
    Z2 = tf.add(tf.matmul(W2, A1), b2)
    A2 = tf.nn.relu(Z2)
    Z3 = tf.add(tf.matmul(W3, A2), b3)
    y_hat = Z3 #tf.minimum(5.0, tf.maximum(0.0, Z3))
    return y_hat

In [0]:
def stochastic_gradient_descent_model(num_epochs, training_sample_size, use_train_matrix):
    
    tf.reset_default_graph()

    x, y = create_placeholders(num_users + num_movies, 1, 1) #497959

    parameters = initialize_parameters(num_users + num_movies, 1) #497959

    y_hat = forward_propagation(x, parameters)

    #cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=tf.transpose(y_hat), labels=tf.transpose(y)))
    #print(y_hat.shape, y.shape)
    cost = tf.losses.mean_squared_error(y, y_hat)
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001,
        beta1=0.9,
        beta2=0.999,
        epsilon=1e-08,
        use_locking=False,
        name='Adam').minimize(cost)

    init = tf.global_variables_initializer()

    with tf.Session() as sess:

        sess.run(init)

        for epoch in range(num_epochs):
            
            print("epoch",epoch+1)
            epoch_cost = 0
            divisor = 0
            
            if use_train_matrix == False:
                for user_row in range(0,training_sample_size):
                    #print("user", user_row + 1)
                    for movie_id, rating in enumerate(user_movies[user_row]):
                        X=[]
                        if rating>0:
                            user_matrix = user_movies[user_row]
                            movie_matrix = user_movies[:, movie_id]
                            X = np.expand_dims(np.concatenate((user_matrix, movie_matrix)), axis=1)
                            #print(X.shape)
                            Y = np.expand_dims(np.expand_dims(rating, axis=1),axis=1)
                            #print(Y)
                            _ , cost_ = sess.run([optimizer,cost], feed_dict={x:X,y:Y})
                            epoch_cost += cost_
                            divisor += 1
                            break
                            
            elif use_train_matrix == True:
                for index, sparse_vector in enumerate(user_movies_train):
                    X = np.expand_dims(sparse_vector, axis=1)
                    Y = np.expand_dims(np.expand_dims(ratings[index], axis=1), axis=1)
                    _ , cost_ = sess.run([optimizer,cost], feed_dict={x:X,y:Y})
                    epoch_cost += cost_
                    divisor += 1
                    if (index>training_sample_size):
                        break
                        #pass
            
            epoch_cost /= divisor
            print("training loss:", epoch_cost,"\n")
            
        parameters = sess.run(parameters)
        return parameters
    
def test_stochastic_gradient_descent_model(parameters, test_sample_size, show_predictions, use_train_matrix):
    
    cost = 0
    test_sample_users = np.random.randint(count_nonzero_indices - training_sample_size, size=(test_sample_size,1)) + training_sample_size
    #print(test_sample_users)
    
    for i in range(test_sample_size):
        
        j = test_sample_users[i][0]
        
        if use_train_matrix == False:
            
            user_vector = np.expand_dims(user_movies[users[j]], axis =1)
            #print(user_vector.shape)
            movie_vector = np.expand_dims(user_movies[:, movies[j]] , axis=1)
            #print(movie_vector.shape)
            X_predict = np.concatenate((user_vector, movie_vector), axis=0)
            X_predict = tf.cast(X_predict, tf.float32)
            
        elif use_train_matrix == True:
            
            X_predict = np.expand_dims(user_movies_train[j], axis=1)
            X_predict = tf.cast(X_predict, tf.float32)
            
        prediction = forward_propagation(X_predict, parameters)
        #actual_rating = user_movies_train_users[users[j], movies[j]]
        actual_rating = ratings[j]
        #actual_rating = expand_dims(actual_rating, axis=1)
        
        if show_predictions == 1:
            sess = tf.Session()
            print("prediction:", min(max(round(sess.run(prediction)[0][0]),0.0),5.0))
            sess.close()
            print("actual rating:", actual_rating,"\n")
        
        cost += tf.losses.mean_squared_error(actual_rating, prediction[0,0])
        
    cost /= test_sample_size
    return cost

In [0]:
def recommend_movies_1(user_id, parameters, search_space_size, number_of_recommendations):
    
        predictions = np.zeros(search_space_size)
        search_space_movies = np.random.randint(movies.shape[0], size=(movies.shape[0], 1))
        
        #print(search_space_movies)
        #print(movies[search_space_movies[0][0]])
        
        for i in range(search_space_size):
        
            j = search_space_movies[i][0]
            
            tic = time.time()
            user_vector = np.expand_dims(user_movies[user_dict[user_id]], axis =1)
            movie_vector = np.expand_dims(user_movies[:, movies[j]] , axis = 1)
            X_predict = np.concatenate((user_vector, movie_vector), axis=0)
            X_predict = tf.cast(X_predict, tf.float32)
            #print("Time elapsed for predicting:",time.time() - tic)
            
            tic = time.time()
            prediction = tf.Session().run(forward_propagation(X_predict, parameters))
            #print("Time elapsed for calculating:",time.time() - tic)
            
            predictions[i] = prediction
            
        i_max = np.argmax(predictions) 
        j_max = search_space_movies[i][0]    
        return movies[j]

In [0]:
#train
training_sample_size = 90
tic = time.time()
parameters = stochastic_gradient_descent_model(num_epochs = 20, training_sample_size = training_sample_size, use_train_matrix = True)
toc = time.time()
print("time elapsed:",(toc - tic))

epoch 1




training loss: 2039.1399489683706 

epoch 2
training loss: 3352.207876700746 

epoch 3
training loss: 3652.7607915136305 

epoch 4
training loss: 200.2159911255604 

epoch 5
training loss: 64.04948830073599 

epoch 6
training loss: 37.56620274251066 

epoch 7
training loss: 35.260375770598486 

epoch 8
training loss: 31.865332347461816 

epoch 9
training loss: 35.967080878103964 

epoch 10
training loss: 47.07712229557776 

epoch 11
training loss: 21.960863165056292 

epoch 12
training loss: 26.492141803311508 

epoch 13
training loss: 17.564287356991805 

epoch 14
training loss: 13.240431679787276 

epoch 15
training loss: 20.33603730859961 

epoch 16
training loss: 18.19219016924541 

epoch 17
training loss: 13.682747877825813 

epoch 18
training loss: 10.981376668265591 

epoch 19
training loss: 10.256042455854502 

epoch 20
training loss: 13.898232973692085 

time elapsed: 87.01115703582764


In [0]:
#test
cost = test_stochastic_gradient_descent_model(parameters, test_sample_size = 10, show_predictions = True, use_train_matrix = True)
sess = tf.Session()
print("test loss:", sess.run(cost))
sess.close()

prediction: 1.0
actual rating: 3.0 

prediction: 3.0
actual rating: 4.0 

prediction: 4.0
actual rating: 5.0 

prediction: 4.0
actual rating: 5.0 

prediction: 2.0
actual rating: 5.0 

prediction: 2.0
actual rating: 3.0 

prediction: 1.0
actual rating: 3.0 

prediction: 2.0
actual rating: 2.0 

prediction: 1.0
actual rating: 3.0 

prediction: 4.0
actual rating: 5.0 

test loss: 2.7258148


In [0]:
#load movie titles
titles_file = open("dataset/movie_titles.txt", encoding = "ISO-8859-1")
movie_info = titles_file.read().split("\n")
movie_ids_titles = {}
for i in range(len(movie_info)):
    info_split = movie_info[i].split(",")
    #print(info_split[2])
    movie_ids_titles[str(info_split[0])] = info_split[2]

In [0]:
#recommend movie
#tic = time.time()
user_s_id = "1488844"
recommended_movie_id = recommend_movies_1(user_s_id, parameters, search_space_size = 10, number_of_recommendations = 1)
print("Recommended movie: ",movie_ids_titles[str(recommended_movie_id)])

#print("Time elapsed:",time.time() - tic)

Recommended movie:  Babe: Pig in the City


In [0]:
DON'T RUN
#predict
user_id = 1025579
movie_id = 1

X_predict = np.expand_dims(np.concatenate((user_movies[user_dict[str(user_id)]], user_movies[:, movie_id - 1])), axis=1)
X_predict = tf.cast(X_predict, tf.float32)
prediction = forward_propagation(X_predict, parameters)

sess = tf.Session()
predicted_rating = sess.run(prediction)[0,0]
actual_rating = user_movies[user_dict[str(user_id)], movie_id - 1]
print("predicted rating:", predicted_rating)
print("actual rating:", actual_rating)
print("cost:", sess.run(tf.losses.mean_squared_error(actual_rating, predicted_rating)))
sess.close()

In [0]:
def minibatch_gradient_descent_model(num_epochs, training_sample_size, minibatch_size, show_ratings):
    
    tf.reset_default_graph()

    x, y = create_placeholders(num_users + num_movies, 1, minibatch_size)

    parameters = initialize_parameters(num_users + num_movies, 1)

    y_hat = forward_propagation(x, parameters)

    #cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=tf.transpose(y_hat), labels=tf.transpose(y)))
    #print(y_hat.shape, y.shape)
    cost = tf.losses.mean_squared_error(y, y_hat)
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001,
        beta1=0.9,
        beta2=0.999,
        epsilon=1e-08,
        use_locking=False,
        name='Adam').minimize(cost)

    init = tf.global_variables_initializer()

    with tf.Session() as sess:

        sess.run(init)
        
        movie_id = np.random.randint(0,user_movies.shape[1] - minibatch_size)
        for epoch in range(num_epochs):
            print("\nepoch",epoch+1)
            epoch_cost = 0
            num_of_samples = 0
            user_id_start = np.random.randint(0, 100)
            for user_row in range(user_id_start, user_id_start + training_sample_size, minibatch_size):
                #print("user", user_row + 1)
                movie_matrix = np.transpose(user_movies[:, movie_id:movie_id + minibatch_size])
                user_matrix = user_movies[user_row,:]
                user_matrix = np.multiply(user_matrix, np.ones((minibatch_size, user_matrix.shape[0])))
                #print(movie_matrix.shape)
                #print(user_matrix.shape, movie_matrix.shape)
                X = np.concatenate((user_matrix, movie_matrix), axis=1).T
                #print(X.shape)
                #Y = np.expand_dims(np.expand_dims(rating, axis=1),axis=1) #do the diagonal
                Y = np.expand_dims(user_movies[user_row, movie_id:movie_id + minibatch_size], axis=0)
                if (show_ratings == True):
                    print(Y)
                _ , cost_ = sess.run([optimizer,cost], feed_dict={x:X,y:Y})
                epoch_cost += cost_
                num_of_samples += 1
            print("cost:", epoch_cost/num_of_samples)
        parameters = sess.run(parameters)
        return parameters

In [0]:
tic = time.time()
parameters = minibatch_gradient_descent_model(num_epochs = 5, training_sample_size=500, minibatch_size = 100, show_ratings = False)
toc = time.time()
print("\ntime elapsed:",(toc - tic))


epoch 1
cost: 891.6318532526493

epoch 2
cost: 16.20165205001831

epoch 3
cost: 9.482336139678955

epoch 4
cost: 8.098219299316407

epoch 5
cost: 4.119643640518189

time elapsed: 90.39139199256897


In [0]:
def recommend_movies_2(user_id, parameters, minibatch_size):
    
    user_row = user_dict[user_id]
    init = tf.global_variables_initializer()
    
    with tf.Session() as sess:

        sess.run(init)
    
        movie_id = np.random.randint(0,user_movies.shape[1] - minibatch_size)
        movie_matrix = np.transpose(user_movies[:, movie_id:movie_id + minibatch_size])
        user_matrix = user_movies[user_row,:]
        user_matrix = np.multiply(user_matrix, np.ones((minibatch_size, user_matrix.shape[0])))

        X = np.concatenate((user_matrix, movie_matrix), axis=1).T
        X = tf.cast(X, tf.float32)
        
        prediction = tf.Session().run(forward_propagation(X, parameters))

        return prediction, movie_id

In [0]:
#recommend movie
user_s_id = "1488844"
prediction, movie_id_start = recommend_movies_2(user_id = user_s_id, parameters = parameters, minibatch_size = 50)
recommended_movie_id = movie_id_start + np.argmax(prediction)
print(prediction)
print("Recommended movie: ",movie_ids_titles[str(recommended_movie_id)])

[[-2.0534500e-02  4.7625193e-01  2.0013235e+00  1.1377928e+00
   4.2394650e-01  1.7522650e+00  2.3994789e+00  6.2599564e-01
   1.4822948e+01  2.0324595e+00  8.1194982e-02  1.9309703e+00
   1.9998772e-01 -3.0098987e-01 -3.9502397e-02  1.1360940e+00
   1.3193545e+01  1.2176486e+00 -3.4572620e-02 -5.4923404e-02
   1.6793445e+00 -1.2143776e-01  3.0449943e-03 -3.2478757e-02
   5.3272138e+00  1.6025283e+00  1.2275100e+00  1.9188828e+00
   1.3296590e+00  2.4529421e-01  2.0497715e+00  6.9647579e+00
   1.2743345e+00  2.0618563e+01  2.1756930e-02  1.2452878e+00
   9.5784432e-01  1.8110152e+00  1.7485120e+00  1.7188232e+00
   1.3365153e+00  1.7090050e+00  1.9848616e+00  7.1905441e+00
   2.4056156e+00 -6.0333703e-03  1.3115401e+00 -2.3484262e-02
   1.1351225e+00  1.4826652e+00]]
Recommended movie:  Rainbow Brite and the Star Stealer


In [0]:
DON'T RUN
user_id = 321111
movie_id = 2

X_test = np.expand_dims(np.concatenate((user_movies[user_dict[str(user_id)]], user_movies[:, movie_id - 1])), axis=1)
X_test = tf.cast(X_test, tf.float32)
print(X_test)
prediction = forward_propagation(X_test, parameters)
sess = tf.Session()
print(np.math.floor(sess.run(prediction)))

Tensor("Cast:0", shape=(497959, 1), dtype=float32)
-1
