In [None]:
from collections import defaultdict
import numpy as np
import os, random
import pandas as pd
import scipy
from scipy import sparse

# Directory holding the MovieLens CSV files. The ML20M_PATH environment
# variable overrides the original hard-coded location, which is kept as the
# default so existing setups keep working.
dataset_path = os.environ.get("ML20M_PATH", "/Users/a0m02fp/Downloads/ml-20m")

# Only the first 10000 rows of ratings.csv are read.
ratings_df = pd.read_csv(os.path.join(dataset_path, "ratings.csv"), encoding="utf-8", sep=",", nrows=10000)

user_id, movie_id, ratings = list(ratings_df[u'userId']), list(ratings_df[u'movieId']), list(ratings_df[u'rating'])

# Materialise the (user, movie, rating) triples: later cells call len() on and
# index into uid_mid_pairs, which a lazy Python 3 zip object does not support.
uid_mid_pairs = list(zip(user_id, movie_id, ratings))

In [None]:
# Distinct raw user ids in ascending order; a user's position in this list is
# the zero-based index used for it everywhere downstream.
user_ids = sorted(set(user_id))
n_users = len(user_ids)

# raw user id -> zero-based index
uid_map = {raw_uid: position for position, raw_uid in enumerate(user_ids)}

In [None]:
movies_df = pd.read_csv(
    os.path.join(dataset_path, "movies.csv"),
    encoding="utf-8",
    sep=",",
)

movie_ids = list(movies_df[u'movieId'])
mid_titles = list(movies_df[u'title'])

n_movies = len(movie_ids)

# raw movie id -> title
mid_to_title_map = dict(zip(movie_ids, mid_titles))

# Zero-based index <-> raw movie id, following the row order of movies.csv.
mid_reverse_map = dict(enumerate(movie_ids))
mid_map = {raw_mid: position for position, raw_mid in enumerate(movie_ids)}

In [None]:
# Translate raw (userId, movieId, rating) triples into (user index, movie
# index, rating). The list is rebuilt from ratings_df instead of being rewritten
# in place, so re-running this cell does not try to remap already-remapped
# indices, and it no longer needs uid_mid_pairs to be a subscriptable list.
uid_mid_pairs = [
    (uid_map[uid], mid_map[mid], rating)
    for uid, mid, rating in zip(ratings_df[u'userId'], ratings_df[u'movieId'], ratings_df[u'rating'])
]

In [None]:
# Transpose the list of triples into three parallel lists.
uids, mids, ratings = (list(column) for column in zip(*uid_mid_pairs))

# Sparse (n_users x n_movies) matrix with the rating stored at [user index, movie index].
ratings_matrix = sparse.csr_matrix(
    (ratings, (uids, mids)),
    shape=(n_users, n_movies),
)

In [None]:
from scipy.sparse import csr_matrix

# One-hot encodings with one row per rating: uid_mat[i, uids[i]] == 1 and
# mid_mat[i, mids[i]] == 1. They are built directly from coordinates instead of
# assigning into an empty CSR matrix, which changes the sparsity structure
# element by element and is the slow path for CSR.
n_ratings = len(uid_mid_pairs)
rating_rows = np.arange(n_ratings)

uid_mat = csr_matrix((np.ones(n_ratings), (rating_rows, uids)), shape=(n_ratings, n_users))
mid_mat = csr_matrix((np.ones(n_ratings), (rating_rows, mids)), shape=(n_ratings, n_movies))

In [None]:
# User-supplied tags, read from tags.csv in the dataset directory.
tags_csv_path = os.path.join(dataset_path, "tags.csv")
tags_df = pd.read_csv(tags_csv_path, sep=",", encoding="utf-8")

In [None]:
# movies.csv read again; only its movieId and genres columns are used from this frame.
genres_df = pd.read_csv(
    os.path.join(dataset_path, "movies.csv"),
    sep=",",
    encoding="utf-8",
)

In [None]:
import re
import sys
# Python 2 only: make utf-8 the default codec so str() on unicode tags does not
# raise. reload(sys) is needed to get setdefaultencoding back, so the current
# stdout is remembered and restored afterwards. Neither reload() nor
# sys.setdefaultencoding exists on Python 3, where this step is skipped.
stdout = sys.stdout
if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding('utf-8')
    sys.stdout = stdout

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# Rows without a tag are dropped up front; str() on a missing value would
# otherwise produce the literal tag "nan".
tagged_rows = tags_df[[u'movieId', u'tag']].dropna(subset=[u'tag'])

movie_id, tags = list(tagged_rows[u'movieId']), [str(tag) for tag in tagged_rows[u'tag']]

# movie index -> list of cleaned tags (genres are appended to the same lists later)
movie_tag_map = defaultdict(list)

non_alnum_re = re.compile(r"[^a-zA-Z0-9 ]")
whitespace_re = re.compile(r"\s+")

for raw_mid, raw_tag in zip(movie_id, tags):
    # Lower-case, replace everything but letters/digits/spaces, collapse whitespace.
    tag = non_alnum_re.sub(" ", raw_tag.lower()).strip()
    tag = whitespace_re.sub(" ", tag)

    kept_words = [word for word in tag.split(" ") if word and word not in stop_words]

    # Tags that are empty, or consist only of stop words, are skipped so that
    # no empty string ends up in the movie's tag list.
    if kept_words:
        movie_tag_map[mid_map[raw_mid]].append(" ".join(kept_words))
            
movie_id, genres = list(genres_df[u'movieId']), list(genres_df[u'genres'])

# Every "|"-separated genre (lower-cased) is added to the movie's tag list.
for raw_mid, genre_field in zip(movie_id, genres):
    movie_tag_map[mid_map[raw_mid]].extend(genre_field.lower().split("|"))

# One "$$$"-joined string of tags per movie index, in index order.
movie_tags = ["$$$".join(movie_tag_map[mid]) for mid in range(n_movies)]

In [None]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

def _split_on_tag_separator(sent):
    """Split a "$$$"-joined tag string back into its individual tags."""
    return sent.split("$$$")


# TF-IDF over whole tags: each "$$$"-separated tag is treated as one token.
vectorizer = TfidfVectorizer(
    tokenizer=_split_on_tag_separator,
    ngram_range=(1, 1),
    stop_words='english',
)
movie_tag_mat = vectorizer.fit_transform(movie_tags)

In [None]:
# One row per rating: row i is the TF-IDF tag row of the movie index mids[i].
tag_mat = movie_tag_mat[mids,:]

In [None]:
# 0/1 indicator of which (user, movie) cells of ratings_matrix are non-zero.
rated_mask = ratings_matrix != 0
implicit = sparse.csr_matrix(rated_mask.astype(int))

# One row per rating: row i is the indicator row of the user index uids[i].
imp_mat = implicit[uids, :]

In [None]:
from scipy.sparse import hstack
# Full design matrix, one row per rating: the user one-hot, movie one-hot,
# movie tag TF-IDF and user implicit-feedback blocks stacked side by side.
mat = hstack((uid_mat, mid_mat, tag_mat, imp_mat))

In [None]:
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Fixed seed so the 80/20 split, and every score computed from it, is the same
# on each run of the notebook.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(mat, ratings, test_size=0.2, random_state=SPLIT_SEED)

In [None]:
# Baseline: support vector regression with a linear kernel, fitted on the
# training split of the design matrix.
model = SVR(kernel='linear')
model.fit(X_train, y_train)

In [None]:
# Baseline predictions for the held-out split.
preds = model.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error
# Mean squared error of the baseline on the held-out split. The call form of
# print is valid on both Python 2 and Python 3.
print(mean_squared_error(y_test, preds))

In [None]:
# Global bias of the factorization machine and the two running moment
# estimates kept for its optimiser update; all three start at zero.
weight_0 = 0.0
weight_0_m = 0.0
weight_0_v = 0.0

In [None]:
# Per-feature linear weights and their two running moment estimates: three
# independent zero vectors, one entry per column of the design matrix.
n_features = mat.shape[1]
weight_k, weight_k_m, weight_k_v = (np.zeros(n_features) for _ in range(3))

In [None]:
# Number of latent dimensions per feature.
k = 64
n_features = mat.shape[1]

# Latent factors start from small Gaussian noise rather than zeros. With
# all-zero factors both terms of the factor gradient are multiplied by the
# factors themselves, so the gradient is exactly zero at every step and the
# pairwise-interaction part of the model would never train.
factors = np.random.RandomState(42).normal(loc=0.0, scale=0.01, size=(n_features, k))

# Running moment estimates for the factor updates start at zero.
factors_m, factors_v = np.zeros((n_features, k)), np.zeros((n_features, k))

In [None]:
import random

def get_predictions(selected_data, weight_0, weight_k, factors):
    """Factorization-machine predictions for a batch of rows.

    selected_data must provide .multiply, .power and .dot (a scipy sparse
    matrix in this notebook), with one sample per row.
    weight_0 is the global bias, weight_k the per-feature linear weights and
    factors the (n_features, n_factors) latent matrix.

    Returns bias + linear term + pairwise term for every row.
    """
    # Linear part: per-row sum of feature value times its weight.
    linear_term = np.squeeze(np.asarray(selected_data.multiply(weight_k).sum(axis=1)))

    # Pairwise part computed per latent dimension as
    # 0.5 * ((sum_i v_if x_i)^2 - sum_i v_if^2 x_i^2), then summed over dimensions.
    projected = selected_data.dot(factors)
    squared_projected = selected_data.power(2).dot(factors ** 2)
    pairwise_term = (0.5 * (projected ** 2 - squared_projected)).sum(axis=1)

    return weight_0 + linear_term + pairwise_term

def get_errors(selected_data, weight_0, weight_k, factors, true_labels):
    """Residuals of the model on a batch: true_labels minus the predictions
    that get_predictions returns for the same rows and parameters."""
    return true_labels - get_predictions(selected_data, weight_0, weight_k, factors)

In [None]:
# Step size and L2 regularisation strength.
eta = 0.001
lambdas = 0.1

# Decay rates of the first / second moment estimates, and the constant added
# to the denominator of the update.
beta1 = 0.9
beta2 = 0.999
eps = 1e-8

# Number of training rows sampled per step.
batch_size = 128

# Step counter and loss histories.
num_iter = 0
losses = []
last_k_losses = []

In [None]:
def _adam_step(param, moment_1, moment_2, grad):
    """Apply one moment-based update and return (param, moment_1, moment_2).

    moment_1 / moment_2 are exponential moving averages of the gradient and of
    the squared gradient (decay rates beta1 / beta2); the step is
    eta * moment_1 / (sqrt(moment_2) + eps). No bias correction is applied.
    Reads eta, beta1, beta2 and eps from the notebook namespace.
    """
    moment_1 = beta1 * moment_1 + (1 - beta1) * grad
    moment_2 = beta2 * moment_2 + (1 - beta2) * (grad**2)
    param = param - eta * moment_1 / (np.sqrt(moment_2) + eps)
    return param, moment_1, moment_2


# Mini-batch training of the factorization machine.
# NOTE(review): the only exit is a held-out RMSE below 0.5; if that is never
# reached the loop runs until the kernel is interrupted.
while True:
    num_iter += 1

    # Every 10th step: RMSE on the held-out split, recorded and printed.
    if num_iter % 10 == 0:
        errs_validation = get_errors(X_test, weight_0, weight_k, factors, y_test)
        rmse_loss = np.sqrt(np.mean(errs_validation**2))

        losses.append(rmse_loss)

        print(rmse_loss)

        if rmse_loss < 0.5:
            break

    # Sample a batch of distinct training rows.
    selected_rows = random.sample(range(X_train.shape[0]), batch_size)

    selected_data = X_train[selected_rows,:]
    selected_labels = np.asarray(y_train)[selected_rows]

    # Residuals (label - prediction) under the current parameters; all three
    # gradients below are computed from these same residuals.
    errs_train = get_errors(selected_data, weight_0, weight_k, factors, selected_labels)

    # Global bias.
    grad = -(np.sum(errs_train) - lambdas * weight_0)
    weight_0, weight_0_m, weight_0_v = _adam_step(weight_0, weight_0_m, weight_0_v, grad)

    # Linear weights: X^T . err, plus L2 shrinkage.
    grad = -(selected_data.T.dot(errs_train) - lambdas * weight_k)
    weight_k, weight_k_m, weight_k_v = _adam_step(weight_k, weight_k_m, weight_k_v, grad)

    # Latent factors. For feature i and latent dimension f the summed quantity is
    #   sum_n err_n * (x_ni * (X V)_nf  -  V_if * x_ni^2).
    projected = selected_data.dot(factors)
    cross_term = selected_data.T.dot(projected * errs_train[:, np.newaxis])

    # sum_n err_n * x_ni^2 is a single vector over features; scaling the rows
    # of the factor matrix by it replaces the former per-row accumulation loop
    # (which also overwrote the global k).
    squared_feature_sums = selected_data.power(2).T.dot(errs_train)
    self_term = squared_feature_sums[:, np.newaxis] * factors

    grad = -((cross_term - self_term) - lambdas * factors)
    factors, factors_m, factors_v = _adam_step(factors, factors_m, factors_v, grad)