In [1]:
import numpy as np
import tensorflow.compat.v1 as tf
# disable tensorflow warnings
tf.logging.set_verbosity(tf.logging.ERROR)
# use tensorflow v1
tf.disable_v2_behavior()


class RBM(object):
    def __init__(self, num_v, id, num_h, batch_size, learning_rate,
                 num_epoch, k=2):
        self.num_v = num_v
        self.num_h = num_h
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.num_epoch = num_epoch
        self.k = k
        self.id = id
        self.W, self.a, self.b = self._init_parameter(id)

    def _init_parameter(self, id):
        abs_val = np.sqrt(2.0 / (self.num_h + self.num_v))
        with tf.variable_scope('rbm{}_parameter'.format(id)):
            W = tf.get_variable('weights', shape=(self.num_v, self.num_h),
                                initializer=tf.random_uniform_initializer(
                minval=-abs_val, maxval=abs_val))
            a = tf.get_variable('visible_bias', shape=(self.num_v),
                                initializer=tf.zeros_initializer())
            b = tf.get_variable('hidden_bias', shape=(self.num_h),
                                initializer=tf.zeros_initializer())
        return W, a, b

    def _gibbs_sampling(self, v):
        v0 = v
        prob_h_v0 = self._prob_h_given_v(v0)
        vk = v
        prob_h_vk = prob_h_v0
        for _ in range(self.k):
            hk = self._bernoulli_sampling(prob_h_vk)
            prob_v_hk = self._prob_v_given_h(hk)
            vk_tmp = prob_v_hk
            vk = tf.where(tf.equal(v0, 0.0), v0, vk_tmp)
            prob_h_vk = self._prob_h_given_v(vk)
        return v0, prob_h_v0, vk, prob_h_vk

    def _prob_v_given_h(self, h):
        return tf.sigmoid(
            tf.add(self.a, tf.matmul(h, tf.transpose(self.W))))

    def _prob_h_given_v(self, v):
        return tf.sigmoid(tf.add(self.b, tf.matmul(v, self.W)))

    def _bernoulli_sampling(self, prob):
        distribution = tf.distributions.Bernoulli(
            probs=prob, dtype=tf.float32)
        return tf.cast(distribution.sample(), tf.float32)

    def _compute_gradients(self, v0, prob_h_v0, vk, prob_h_vk):
        outer_product0 = tf.matmul(tf.transpose(v0), prob_h_v0)
        outer_productk = tf.matmul(tf.transpose(vk), prob_h_vk)
        W_grad = tf.reduce_mean(outer_product0 - outer_productk, axis=0)
        a_grad = tf.reduce_mean(v0 - vk, axis=0)
        b_grad = tf.reduce_mean(prob_h_v0 - prob_h_vk, axis=0)
        return W_grad, a_grad, b_grad

    def _optimize(self, v):
        v0, prob_h_v0, vk, prob_h_vk = self._gibbs_sampling(v)
        W_grad, a_grad, b_grad = self._compute_gradients(
            v0, prob_h_v0, vk, prob_h_vk)
        para_update = [tf.assign(self.W, tf.add(self.W, self.learning_rate*W_grad)),
                       tf.assign(self.a, tf.add(
                           self.a, self.learning_rate*a_grad)),
                       tf.assign(self.b, tf.add(self.b, self.learning_rate*b_grad))]
        bool_mask = tf.cast(tf.where(tf.equal(v0, 0.0), x=tf.zeros_like(
            v0), y=tf.ones_like(v0)), dtype=tf.bool)
        # mask the zero values because they are not included in the error calculation
        v0_mask = tf.boolean_mask(v0, bool_mask)
        vk_mask = tf.boolean_mask(vk, bool_mask)
        error = tf.metrics.mean_squared_error(v0_mask, vk_mask)[1]
        return para_update, error

    def train(self, X_train):
        X_train_plac = tf.placeholder(tf.float32, [None, self.num_v])
        para_update, error = self._optimize(X_train_plac)
        init = tf.group(tf.global_variables_initializer(),
                        tf.local_variables_initializer())
        with tf.Session() as sess:
            sess.run(init)
            epochs_err = []
            n_batch = int(X_train.shape[0] / self.batch_size)
            for epoch in range(1, self.num_epoch + 1):
                epoch_err_sum = 0
                for batch_number in range(n_batch):
                    batch = X_train[batch_number * self.batch_size:
                                    (batch_number + 1) * self.batch_size]
                    parameters, batch_err = sess.run((para_update, error), feed_dict={
                        X_train_plac: batch})
                    epoch_err_sum += batch_err
                epochs_err.append(epoch_err_sum / n_batch)
                if epoch % 10 == 0:
                    print("Training error at epoch %s: %s" %
                          (epoch, epochs_err[-1]))
        return parameters

    def predict(self, v, parameters):
        W, a, b = parameters
        prob_h_v = 1 / (1 + np.exp(-(b + np.matmul(v, W))))
        h = np.random.binomial(1, p=prob_h_v)
        prob_v_h = 1 / (1 + np.exp(-(a + np.matmul(h, np.transpose(W)))))
        return prob_v_h

    def hidden_layer(self, v, parameters):
        W, a, b = parameters
        h = 1 / (1 + np.exp(-(b + np.matmul(v, W))))
        return h

In [2]:
class DBN(object):
    def __init__(self, layer_sizes, batch_size, learning_rates, num_epoch, k=2):
        self.rbms = []
        for i in range(1, len(layer_sizes)):
            rbm = RBM(num_v=layer_sizes[i-1], id=i, num_h=layer_sizes[i], batch_size=batch_size,
                      learning_rate=learning_rates[i-1], num_epoch=num_epoch, k=k)
            self.rbms.append(rbm)

    def train(self, X_train):
        self.rbms_para = []
        input_data = None
        for rbm in self.rbms:
            if input_data is None:
                input_data = X_train.copy()
            parameters = rbm.train(input_data)
            self.rbms_para.append(parameters)
            input_data = rbm.hidden_layer(input_data, parameters)

    def predict(self, X):
        data = None
        for rbm, parameters in zip(self.rbms, self.rbms_para):
            if data is None:
                data = X.copy()
            # fix for one layer DBN
            if len(self.rbms) == 1 and rbm.id == len(self.rbms):
                return rbm.predict(data, parameters)
            data = rbm.hidden_layer(data, parameters)
        return data


In [3]:
from sklearn.metrics import mean_squared_error
import numpy as np
# disable warnings
import warnings
warnings.filterwarnings("ignore")

data_path = '../ml-1m/ratings.dat'
num_users = 6040
num_movies = 3706
data = np.zeros([num_users, num_movies], dtype=np.float32)
movie_dict = {}
with open(data_path, 'r') as file:
    for line in file.readlines()[1:]:
        user_id, movie_id, rating, _ = line.split("::")
        user_id = int(user_id) - 1
        if movie_id not in movie_dict:
            movie_dict[movie_id] = len(movie_dict)
        rating = float(rating) / 5
        data[user_id, movie_dict[movie_id]] = rating
data = np.reshape(data, [data.shape[0], -1])
values, counts = np.unique(data, return_counts=True)
for value, count in zip(values, counts):
    print(f'Number of {int(value*5)} stars: {count}')


Number of 0 stars: 21384032
Number of 1 stars: 56174
Number of 2 stars: 107557
Number of 3 stars: 261197
Number of 4 stars: 348971
Number of 5 stars: 226309


In [4]:
np.random.seed(1)
np.random.shuffle(data)
num_train = int(0.9 * data.shape[0])
data_train, data_test = data[:num_train, :], data[num_train:, :]
sim_index = np.zeros_like(data_test, dtype=bool)
perc_sim = 0.2
for i, user_test in enumerate(data_test):
    exist_index = np.where(user_test > 0.0)[0]
    sim_index[i, np.random.choice(exist_index,
                                  int(len(exist_index)*perc_sim))] = True


dbn = DBN(layer_sizes=[num_movies, 80], batch_size=64, num_epoch=100, learning_rates=[0.1], k=5)
dbn.train(data_train)
data_test_sim = np.copy(data_test)
data_test_sim[sim_index] = 0.0

prediction = dbn.predict(data_test_sim)
print(mean_squared_error(data_test[sim_index], prediction[sim_index]))

2024-06-03 01:07:00.486969: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:388] MLIR V1 optimization pass is not enabled


Training error at epoch 10: 0.04473554427247672
Training error at epoch 20: 0.04221944697201252
Training error at epoch 30: 0.04116101199317546
Training error at epoch 40: 0.040564773249484244
Training error at epoch 50: 0.04017959224681059
Training error at epoch 60: 0.0399079890034738
Training error at epoch 70: 0.03970624453255108
Training error at epoch 80: 0.03954996665318807
Training error at epoch 90: 0.03942516335241851
Training error at epoch 100: 0.03932305851152965
0.03795511475475648


In [5]:
# Movie recommendation
# The recommender receive the user inputs including User ID, Movie ID, Rating: a rating from 1-5 stars for a specific movie
# after that, the model should give out the recommendation of the movies that the user might like.
# Besides, the user can give some filter to the recommendation, such as the genre of the movie, the year of the movie, etc.

# load movie data
movie_path = '../ml-1m/movies.dat'
movie_data = {}

with open(movie_path, 'r', encoding='latin-1') as file:
    for line in file.readlines():
        line = line.strip()
        movie_id, title, genre = line.split("::")
        year = title[-5:-1] if title[-1] == ')' else None
        movie_data[movie_id] = {
            'title': title,
            'genre': genre.split('|'),
            'year': year
        }

from collections import deque

class Recommender:
    def __init__(self):
        self.rating = np.zeros([num_movies], dtype=np.float32)

    def input_rating(self, movie_id, rating):
        self.rating[movie_dict[movie_id]] = float(rating) / 5

    def get_recommendation(self, query=None):
        index_movie = {value: key for key, value in movie_dict.items()}
        prediction = dbn.predict(np.reshape(self.rating, [1, -1]))
        watched = np.where(self.rating != 0)[0]
        low_index = np.where(prediction[0] >= 0.6)[0] # 3 to 4 stars
        med_index = np.where(prediction[0] >= 0.8)[0] # 4 to 4.5 stars
        high_index = np.where(prediction[0] >= 0.9)[0] # 4.5 to 5 stars
        # print('Movies watched:',', '.join(movie_data[index_movie[index]]['title'] for index in watched))
        # print('Movies with low prediction:', ', '.join(movie_data[index_movie[index]]['title'] for index in low_index if index not in watched and index not in med_index and index not in high_index))
        # print('Movies with med prediction:', ', '.join(movie_data[index_movie[index]]['title'] for index in med_index if index not in watched and index not in high_index))
        # print('Movies with high prediction:', ', '.join(movie_data[index_movie[index]]['title'] for index in high_index if index not in watched))
        
        set_watched = set(watched)
        low_index = set(low_index) - set_watched
        med_index = set(med_index) - set_watched
        high_index = set(high_index) - set_watched
        recommendation = deque()
        for i in high_index:
            recommendation.append(i)
        for i in med_index:
            recommendation.append(i)
        for i in low_index:
            recommendation.append(i)
        
        # filter the recommendation
        if not query:
            final_recommendation = []
            count = 5
            while count > 0 and recommendation:
                final_recommendation.append(recommendation.popleft())
                count -= 1
        else:
            final_recommendation = []
            count = 5
            while count > 0 and recommendation:
                idx = recommendation.popleft()
                # print(movie_data[index_movie[idx]])
                if 'genre' in query and query['genre'] not in movie_data[index_movie[idx]]['genre']:
                    continue
                if 'year' in query and query['year'] != movie_data[index_movie[idx]]['year']:
                    continue
                if 'rating' in query and prediction[0][idx]*5 >= query['rating'] and prediction[0][idx]*5 < query['rating']+1:
                    continue
                final_recommendation.append(idx)
                count -= 1
        
        return [movie_data[index_movie[index]]['title'] for index in final_recommendation]


recommender = Recommender()
user_1_ratings = {'1193': 5, 
                '661': 3,
                '914': 3,
                '3408': 4,
                '2355': 5,
                '1197': 3,
                '1287': 5,
                '2804': 5,
                '594': 4,
                '919': 4,
                '595': 5,
                '938': 4,
                '2398': 4,
                }
for movie_id, rating in user_1_ratings.items():
    recommender.input_rating(movie_id, rating)
print(recommender.get_recommendation())
print(recommender.get_recommendation(query={'genre': 'Crime'}))
print(recommender.get_recommendation(query={'year': '1999'}))
print(recommender.get_recommendation(query={'rating': 4}))

['Shawshank Redemption, The (1994)', 'Shall We Dance? (1937)', 'Patton (1970)', '42 Up (1998)', 'Killing Fields, The (1984)']
['Usual Suspects, The (1995)', 'Godfather, The (1972)', 'Godfather: Part II, The (1974)', 'M (1931)', 'Double Indemnity (1944)']
['Sixth Sense, The (1999)', 'American Beauty (1999)', 'Matrix, The (1999)', 'Tarzan (1999)', 'Girl, Interrupted (1999)']
["Ferris Bueller's Day Off (1986)", 'Sound of Music, The (1965)', 'Airplane! (1980)', 'Tarzan (1999)', 'Bambi (1942)']
