In [1]:
import pandas as pd
import numpy as np
from scipy.optimize import fmin_cg


def simple_fit(ui_matrix, r_matrix, num_features, lam):
    """
    Ajusta el modelo de regresion dada una matriz de calificaciones y
    una mascara de calificados
    """
    y = ui_matrix
    r = r_matrix
    num_items, num_users = y.shape

    theta0 = np.random.rand(num_users, num_features)
    x0 = np.random.rand(num_items, num_features)

    def fold_matrices(x_matrix, theta_matrix):
        return np.concatenate([x_matrix.flatten(), theta_matrix.flatten()])

    def unfold_vector(x):
        x_matrix = np.reshape(x[:x0.size],
                              x0.shape)
        theta_matrix = np.reshape(x[x0.size:],
                                  theta0.shape)
        return x_matrix, theta_matrix

    def unfold_parameter(f):
        def wrapper(x):
            return f(*unfold_vector(x))

        return wrapper

    @unfold_parameter
    def optimization_target(x, theta):
        differences = r * (x @ theta.T - y)
        square_error = (0.5) * np.sum(differences**2)
        regularization = (lam / 2) * (np.sum(x**2) + np.sum(x**2))

        return square_error + regularization

    @unfold_parameter
    def gradient(x, theta):
        differences = np.multiply((np.dot(x, theta.T) - y), r)
        x_grad = np.dot(differences, theta) + lam * x
        theta_grad = np.dot(x.T, differences).T + lam * theta

        return fold_matrices(x_grad, theta_grad)

    init_fold = fold_matrices(x0, theta0)
    result = fmin_cg(f=optimization_target, x0=init_fold, fprime=gradient)

    x, theta = unfold_vector(result)

    return x, theta

In [2]:
def normalized_fit(y, *args):
    means = np.nanmean(y, axis=1)
    y = y - means.reshape(-1, 1)

    r = -(np.isnan(y).astype(int) - 1)
    y = np.nan_to_num(y)

    x, theta = simple_fit(y, r, *args)

    return x, theta, means

In [3]:
ratings_df = pd.read_csv('datasets/ml-latest-small/ratings.csv')
movies = pd.read_csv('datasets/ml-latest-small/movies.csv', index_col=0)

ratings_df.columns = ['user_id', 'movie_id', 'rating', 'timestamp']

user_ratings = ratings_df.pivot(index='movie_id', columns='user_id', values='rating')

In [4]:
x, theta, means = normalized_fit(user_ratings.values, 200, 0.2)

feature_df = pd.DataFrame(x, index=user_ratings.index)

         Current function value: 915.220243
         Iterations: 1381
         Function evaluations: 2409
         Gradient evaluations: 2397


In [30]:
user = user_ratings.sample(axis=1)
user_id = user.columns[0]
user = user.assign(title=movies.title[user_ratings.index])
user.columns = ['rating', 'title']
user.sort_values(by='rating', ascending=False).head(20)

Unnamed: 0_level_0,rating,title
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
47,5.0,Seven (a.k.a. Se7en) (1995)
318,5.0,"Shawshank Redemption, The (1994)"
1704,5.0,Good Will Hunting (1997)
1196,5.0,Star Wars: Episode V - The Empire Strikes Back...
1387,5.0,Jaws (1975)
1407,5.0,Scream (1996)
1625,5.0,"Game, The (1997)"
1617,5.0,L.A. Confidential (1997)
1689,4.0,"Man Who Knew Too Little, The (1997)"
1672,4.0,"Rainmaker, The (1997)"


In [31]:
theta_df = pd.DataFrame(theta, index=user_ratings.columns)
user_theta = theta_df.loc[user_id]

pred = (user_theta.values @ x.T) + means

In [32]:
user['predicted'] = pred
user.sort_values(by='predicted', ascending=False).head(20)

Unnamed: 0_level_0,rating,title,predicted
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
593,,"Silence of the Lambs, The (1991)",5.344535
2571,,"Matrix, The (1999)",5.034324
4088,,"Big Town, The (1987)",5.000031
92494,,Dylan Moran: Monster (2004),5.000028
5960,,Bad Influence (1990),5.000025
3216,,"Vampyros Lesbos (Vampiras, Las) (1971)",5.000024
4617,,Let It Ride (1989),5.000023
4522,,Masquerade (1988),5.000022
3038,,"Face in the Crowd, A (1957)",5.000014
26151,,Au Hasard Balthazar (1966),5.000005
