In [None]:
from collections import defaultdict
import numpy as np
import os, random
import pandas as pd
import scipy
from scipy import sparse

# Directory that holds ratings.csv and movies.csv. The original local path is
# kept as the default; set ML20M_DATASET_PATH to run on another machine.
dataset_path = os.environ.get("ML20M_DATASET_PATH", "/Users/a0m02fp/Downloads/ml-20m")

# Seed both generators used by this notebook (random.shuffle / random.sample
# and np.random.normal) so that Restart & Run All reproduces the same run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

ratings_df = pd.read_csv(os.path.join(dataset_path, "ratings.csv"), encoding="utf-8", sep=",")

user_id, movie_id, ratings = list(ratings_df[u'userId']), list(ratings_df[u'movieId']), list(ratings_df[u'rating'])

# (userId, movieId, rating) triples. Materialised as a list because later
# cells take len(), index into and shuffle this sequence, none of which a
# lazy Python 3 zip object supports.
uid_mid_pairs = list(zip(user_id, movie_id, ratings))

In [None]:
# Distinct raw user ids in ascending order; a user's position in this list
# becomes its row index.
user_ids = sorted(set(user_id))

# raw userId -> zero-based row index
uid_map = {raw_uid: row for row, raw_uid in enumerate(user_ids)}

In [None]:
movies_df = pd.read_csv(os.path.join(dataset_path, "movies.csv"), encoding="utf-8", sep=",")

movie_ids = list(movies_df[u'movieId'])
mid_titles = list(movies_df[u'title'])

# raw movieId -> title
mid_to_title_map = dict(zip(movie_ids, mid_titles))

# Column index <-> raw movieId. The index is the movie's position in the
# movieId column as read from movies.csv.
mid_reverse_map = dict(enumerate(movie_ids))
mid_map = {raw_mid: col for col, raw_mid in enumerate(movie_ids)}

In [None]:
# Drop the loading intermediates so their memory can be reclaimed; the
# lookup maps and uid_mid_pairs built above are kept.
del ratings_df, user_id, movie_id, ratings
del user_ids, movies_df, movie_ids, mid_titles

In [None]:
# Translate raw (userId, movieId) into matrix (row, column) indices.
# The list is rebuilt rather than assigned by position so the cell also
# works when uid_mid_pairs is a lazy zip object (Python 3), which supports
# neither len() nor item assignment.
# NOTE: not idempotent - a second run would look the already-translated
# indices up in uid_map / mid_map again.
uid_mid_pairs = [(uid_map[uid], mid_map[mid], rating) for uid, mid, rating in uid_mid_pairs]

In [None]:
# Global average of the rating field over every (row, column, rating) triple.
mean_rating = np.mean([triple[2] for triple in uid_mid_pairs])

In [None]:
# Split the triples back into three parallel lists (rows, columns, values).
uids, mids, ratings = [list(column) for column in zip(*uid_mid_pairs)]

# Sparse users x movies matrix; the shape comes from the index maps, so
# movies without any rating still get a column.
ratings_matrix = sparse.csr_matrix(
    (ratings, (uids, mids)),
    shape=(len(uid_map), len(mid_map)),
)

del uids, mids, ratings

In [None]:
# n = number of rows (users), m = number of columns (movies) of the matrix.
n = ratings_matrix.shape[0]
m = ratings_matrix.shape[1]

# Width of every latent-factor table created below.
latent_dim = 64

In [None]:
# Dense movie x movie weight table, every entry starting at 1/m.
weights1 = np.empty((m, m))
weights1.fill(1.0/m)

# Running first / second moment accumulators used when weights1 is updated
# in the training loop.
weights1_m = np.zeros((m, m))
weights1_v = np.zeros((m, m))

In [None]:
# Second dense movie x movie weight table, also starting at 1/m everywhere.
weights2 = np.empty((m, m))
weights2.fill(1.0/m)

# Running first / second moment accumulators used when weights2 is updated
# in the training loop.
weights2_m = np.zeros((m, m))
weights2_v = np.zeros((m, m))

In [None]:
# One latent_dim-wide vector per movie, drawn from N(0, 0.001); it is
# combined with the `implicit` matrix in get_latent_neighborhood_scores.
weights_y = np.random.normal(loc=0.0, scale=0.001, size=(m, latent_dim))

# Moment accumulators for weights_y.
weights_ym = np.zeros((m, latent_dim))
weights_yv = np.zeros((m, latent_dim))

In [None]:
# Latent factors: p has one row per user, q one row per movie, both drawn
# from N(0, 0.001). p is drawn before q, as in the flat-draw-then-reshape form.
p = np.random.normal(loc=0.0, scale=0.001, size=(n, latent_dim))
q = np.random.normal(loc=0.0, scale=0.001, size=(m, latent_dim))

# First-moment accumulators for p and q.
pm = np.zeros((n, latent_dim))
qm = np.zeros((m, latent_dim))

# Second-moment accumulators for p and q.
pv = np.zeros((n, latent_dim))
qv = np.zeros((m, latent_dim))

In [None]:
# Per-user and per-movie bias terms, starting at zero.
bias_u = np.zeros(n)
bias_m = np.zeros(m)

# First-moment accumulators (b1m for bias_u, b2m for bias_m).
b1m = np.zeros(n)
b2m = np.zeros(m)

# Second-moment accumulators (b1v for bias_u, b2v for bias_m).
b1v = np.zeros(n)
b2v = np.zeros(m)

In [None]:
# Shuffle in place, then hold out the first 5000 triples for validation and
# train on the rest.
random.shuffle(uid_mid_pairs)

validation_data = uid_mid_pairs[:5000]
training_data = uid_mid_pairs[5000:]

In [None]:
# The full list is no longer needed once it has been split.
del uid_mid_pairs

In [None]:
# Count the non-zero entries in every row of the ratings matrix.
rated_rows = ratings_matrix.nonzero()[0]
num_ratings = np.bincount(rated_rows)
del rated_rows

# Per-row scaling factor 1/sqrt(count); the 0.001 keeps the division finite
# for a row with no entries.
normalizations = 1.0/np.sqrt(num_ratings + 0.001)

In [None]:
# 0/1 indicator of which entries are non-zero ...
rated_mask = (ratings_matrix != 0).astype(int)

# ... with every row scaled by that row's normalisation factor. The
# transpose / multiply / transpose makes the 1-D vector broadcast over rows.
implicit = sparse.csr_matrix(rated_mask.T.multiply(normalizations).T)

del rated_mask

In [None]:
def get_neighborhood_diffs(ratings_matrix, bias_u, bias_m, mean_rating, mydata, normalizations, implicit):
    """Build the two sparse neighbourhood inputs for a batch of pairs.

    Parameters
    ----------
    ratings_matrix : sparse matrix indexed [user row, movie column].
    bias_u, bias_m : 1-D arrays of per-user / per-movie biases.
    mean_rating : scalar added to every baseline.
    mydata : sequence of (user row, movie column, rating) triples; only the
        user row of each triple is read here.
    normalizations : 1-D array with one scaling factor per user row.
    implicit : sparse matrix indexed by user row.

    Returns
    -------
    (diff1, diff2) : two sparse matrices with one row per triple in `mydata`.
        diff1 holds, at every stored non-zero rating of that triple's user,
        (rating - (bias_u[user] + bias_m[movie] + mean_rating)) scaled by
        normalizations[user]. diff2 is the matching rows of `implicit`.
    """
    u_idx = [pair[0] for pair in mydata]

    # One row of ratings per sampled pair, in coordinate form so that values
    # and positions are guaranteed to be aligned.
    batch = ratings_matrix[u_idx].tocoo()

    # Treat stored zeros as "not rated", matching how `nonzero()` sees them.
    rated = batch.data != 0
    rows, cols, values = batch.row[rated], batch.col[rated], batch.data[rated]

    # Evaluate the baseline only where a rating exists, instead of
    # materialising a dense (batch x movies) matrix and indexing into it.
    user_bias = np.asarray(bias_u)[u_idx]
    baselines = user_bias[rows] + np.asarray(bias_m)[cols] + mean_rating

    user_scale = np.asarray(normalizations)[u_idx]
    residuals = (values - baselines) * user_scale[rows]

    diff1 = sparse.csr_matrix((residuals, (rows, cols)), shape=(len(u_idx), len(bias_m)))

    diff2 = implicit[u_idx]

    return diff1, diff2

def get_neighborhood_scores(ratings_matrix, weights1, weights2, bias_u, bias_m, mean_rating, mydata, normalizations, implicit):
    """Neighbourhood part of the prediction for a batch of pairs.

    For every (user row, movie column, rating) triple in `mydata` this sums
    two weighted rows:

    * the normalised rating residuals of the user (diff1) weighted by the
      target movie's row of `weights1`, and
    * the normalised implicit-feedback row of the user (diff2) weighted by
      the target movie's row of `weights2`.

    Returns a NumPy array with one score per triple (0-d for a single
    triple, because of the final squeeze).
    """
    _, m_idx, _ = map(list, zip(*mydata))

    diff1, diff2 = get_neighborhood_diffs(ratings_matrix, bias_u, bias_m, mean_rating, mydata, normalizations, implicit)

    explicit_term = diff1.multiply(weights1[m_idx]).sum(axis=1)

    # weights2 pairs with diff2: the training loop builds the weights2
    # gradient from diff2, so the prediction has to use it as well
    # (previously diff1 was used here and diff2 was ignored).
    implicit_term = diff2.multiply(weights2[m_idx]).sum(axis=1)

    return np.squeeze(np.asarray(explicit_term + implicit_term))

def get_latent_neighborhood_scores(ratings_matrix, weights_y, mydata, normalizations, implicit):
    """Return implicit[users] . weights_y for the users of a batch.

    `mydata` is a sequence of (user row, movie column, rating) triples; only
    the user rows are used. `ratings_matrix` and `normalizations` are
    accepted but not read. The result has one row per triple and one column
    per column of `weights_y`.
    """
    user_rows, _movie_cols, _ratings = (list(field) for field in zip(*mydata))

    user_implicit = implicit[user_rows]
    return user_implicit.dot(weights_y)

def get_ratings_errors(ratings_matrix, p, q, weights1, weights2, weights_y, bias_u, bias_m, mean_rating, mydata, normalizations, implicit):
    """Return (true rating - predicted rating) for every triple in `mydata`.

    The prediction adds, in this order: the dot product of
    (p[user] + latent neighbourhood score) with q[movie], the user bias, the
    movie bias, `mean_rating` and the neighbourhood score.
    """
    u_idx, m_idx, true_ratings = (list(field) for field in zip(*mydata))

    neighborhood = get_neighborhood_scores(
        ratings_matrix, weights1, weights2, bias_u, bias_m,
        mean_rating, mydata, normalizations, implicit,
    )
    latent_neighborhood = get_latent_neighborhood_scores(
        ratings_matrix, weights_y, mydata, normalizations, implicit,
    )

    # User factors shifted by the latent neighbourhood term.
    user_factors = p[u_idx] + latent_neighborhood

    preds = np.sum(user_factors * q[m_idx], axis=1)
    preds = preds + bias_u[u_idx] + bias_m[m_idx] + mean_rating + neighborhood

    # true_ratings is a plain list; the subtraction broadcasts it to an array.
    return true_ratings - preds

In [None]:
# Step size, and the coefficient that multiplies each parameter inside its
# gradient in the training loop.
eta = 0.001
lambdas = 0.1

# Decay rates of the first / second moment accumulators.
beta1 = 0.9
beta2 = 0.999

# Added to sqrt(second moment) in the update denominator.
eps = 1e-8

# Number of training triples sampled per iteration.
batch_size = 32

# Iteration counter and the validation RMSE history filled by the training
# loop.
num_iter = 0
losses = []
last_k_losses = []

In [None]:
def _adam_step(param, first_moment, second_moment, grad, eta, beta1, beta2, eps):
    """One Adam-style update (without bias correction).

    Returns the new (param, first_moment, second_moment); the inputs are not
    modified in place.
    """
    first_moment = beta1 * first_moment + (1 - beta1) * grad
    second_moment = beta2 * second_moment + (1 - beta2) * (grad**2)
    param = param + (-eta * first_moment/(np.sqrt(second_moment) + eps))
    return param, first_moment, second_moment


# Mini-batch training. Every 1000 iterations the validation RMSE is recorded
# and printed; the loop only ends once that RMSE drops below 0.5.
# NOTE(review): there is no iteration cap, so the cell runs until the
# threshold is reached or the kernel is interrupted.
while True:
    num_iter += 1

    if num_iter % 1000 == 0:
        errs_validation = get_ratings_errors(ratings_matrix, p, q, weights1, weights2, weights_y, bias_u, bias_m, mean_rating, validation_data, normalizations, implicit)
        rmse_loss = np.sqrt(np.mean(errs_validation**2))

        losses.append(rmse_loss)

        # Function-call form prints the same value on Python 2 and 3.
        print(rmse_loss)

        if rmse_loss < 0.5:
            break

    selected_pairs = random.sample(training_data, batch_size)

    # Errors are computed once, before any parameter is touched, and reused
    # by every update below.
    errs_train = get_ratings_errors(ratings_matrix, p, q, weights1, weights2, weights_y, bias_u, bias_m, mean_rating, selected_pairs, normalizations, implicit)

    u_idx, m_idx, _ = map(list, zip(*selected_pairs))

    # Fancy indexing returns copies, so x / y below are snapshots. When a
    # user or movie occurs twice in the batch, the write-back keeps only the
    # last of its rows.

    # ---- user and movie biases ----
    x, y = bias_u[u_idx], bias_m[m_idx]

    grad1, grad2 = -(errs_train - lambdas * x), -(errs_train - lambdas * y)

    bias_u[u_idx], b1m[u_idx], b1v[u_idx] = _adam_step(x, b1m[u_idx], b1v[u_idx], grad1, eta, beta1, beta2, eps)
    bias_m[m_idx], b2m[m_idx], b2v[m_idx] = _adam_step(y, b2m[m_idx], b2v[m_idx], grad2, eta, beta1, beta2, eps)

    # ---- neighbourhood weights (rows of the target movies) ----
    # The diffs are rebuilt with the biases that were just updated.
    diff1, diff2 = get_neighborhood_diffs(ratings_matrix, bias_u, bias_m, mean_rating, selected_pairs, normalizations, implicit)

    x, y = weights1[m_idx], weights2[m_idx]

    # Scale every row of the diffs by that pair's error.
    z1 = np.array(diff1.T.multiply(errs_train).T.todense())
    z2 = np.array(diff2.T.multiply(errs_train).T.todense())

    grad1, grad2 = -(z1 - lambdas * x), -(z2 - lambdas * y)

    weights1[m_idx], weights1_m[m_idx], weights1_v[m_idx] = _adam_step(x, weights1_m[m_idx], weights1_v[m_idx], grad1, eta, beta1, beta2, eps)
    weights2[m_idx], weights2_m[m_idx], weights2_v[m_idx] = _adam_step(y, weights2_m[m_idx], weights2_v[m_idx], grad2, eta, beta1, beta2, eps)

    # ---- latent factors ----
    x, y = p[u_idx], q[m_idx]
    neighborhood_scores = get_latent_neighborhood_scores(ratings_matrix, weights_y, selected_pairs, normalizations, implicit)

    # Both gradients are taken from the pre-update snapshots x and y.
    z1, z2 = np.multiply(y.T, errs_train).T, np.multiply((x + neighborhood_scores).T, errs_train).T

    grad1, grad2 = -(z1 - lambdas * x), -(z2 - lambdas * y)

    p[u_idx], pm[u_idx], pv[u_idx] = _adam_step(x, pm[u_idx], pv[u_idx], grad1, eta, beta1, beta2, eps)
    q[m_idx], qm[m_idx], qv[m_idx] = _adam_step(y, qm[m_idx], qv[m_idx], grad2, eta, beta1, beta2, eps)

    # ---- implicit-feedback factors ----
    # NOTE(review): only the weights_y rows of the target movies are updated,
    # not the rows of every movie the user rated - confirm this is intended.
    x = weights_y[m_idx]

    # Uses q after the update above, scaled by each user's normalisation.
    latents = np.multiply(q[m_idx].T, normalizations[u_idx]).T

    z1 = np.multiply(latents.T, errs_train).T
    grad1 = -(z1 - lambdas * x)

    weights_y[m_idx], weights_ym[m_idx], weights_yv[m_idx] = _adam_step(x, weights_ym[m_idx], weights_yv[m_idx], grad1, eta, beta1, beta2, eps)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

plt.plot(losses[:340])
plt.xlabel("Number of epochs")
plt.ylabel("RMSE Loss on validation data")
plt.show()