In [1]:
import pandas as pd
import numpy as np

# Movie metadata, indexed by the first column of the CSV.
movies = pd.read_csv('datasets/ml-20m/movies.csv', index_col=0)

# Tag genome: tag names (indexed by the first CSV column) and the long-format
# score table, whose three columns are renamed to snake_case.
genome_tags = pd.read_csv('datasets/ml-20m/genome-tags.csv', index_col=0).tag
genome_scores = pd.read_csv('datasets/ml-20m/genome-scores.csv')
genome_scores.columns = ['movie_id', 'tag_id', 'relevance']

# Reshape to a wide matrix: one row per movie, one column per tag.
genome_matrix = genome_scores.pivot(index='movie_id', columns='tag_id',
                                    values='relevance')
# Relabel the columns from tag ids to the corresponding tag names.
genome_matrix.columns = genome_tags.loc[genome_matrix.columns].tolist()

# Keep only the movies that have tags.
movies = movies.loc[genome_matrix.index]

# Normalize the genome with the linear map x -> 2x - 1
# (this sends [0, 1] onto [-1, 1], assuming relevance lies in [0, 1]).
genome_matrix = 2 * genome_matrix - 1

# Ten highest-scoring tags of the movie with id 1.
genome_matrix.loc[1].sort_values(ascending=False).head(10)

toys                  0.9985
computer animation    0.9970
pixar animation       0.9920
kids and family       0.9815
animation             0.9715
kids                  0.9585
pixar                 0.9335
children              0.9285
cartoon               0.9130
imdb top 250          0.8840
Name: 1, dtype: float64

In [2]:
# Load every rating and give the four columns snake_case names.
ratings = pd.read_csv('datasets/ml-20m/ratings.csv')
ratings.columns = ['user_id', 'movie_id', 'rating', 'timestamp']

# Keep only the ratings of movies that have tags.
ratings = ratings[ratings['movie_id'].isin(movies.index)]

In [3]:
# Pick one user at random among the users that still have ratings.
# NOTE(review): no seed is set in the visible cells, so every run selects a
# different user and the results of the following cells change accordingly.
user = np.random.choice(ratings.user_id.unique())

user_ratings = ratings.loc[ratings.user_id == user]
# Attach the movie titles with a single vectorized label lookup instead of a
# per-row `movies.loc[...]` loop; `.values` keeps the assignment positional.
user_ratings = user_ratings.assign(
    title=movies.title.loc[user_ratings.movie_id.values].values)

print(user_ratings.shape)
user_ratings.sort_values(by='rating', ascending=False).head(20)

(82, 5)


Unnamed: 0,user_id,movie_id,rating,timestamp,title
13937369,96288,2997,5.0,1159793149,Being John Malkovich (1999)
13937382,96288,4848,5.0,1159792275,Mulholland Drive (2001)
13937362,96288,2329,5.0,1159794208,American History X (1998)
13937367,96288,2858,5.0,1159794335,American Beauty (1999)
13937348,96288,1227,5.0,1159794784,Once Upon a Time in America (1984)
13937347,96288,1222,5.0,1159794563,Full Metal Jacket (1987)
13937345,96288,1193,5.0,1159793167,One Flew Over the Cuckoo's Nest (1975)
13937376,96288,3949,5.0,1159792156,Requiem for a Dream (2000)
13937356,96288,1732,5.0,1159794733,"Big Lebowski, The (1998)"
13937406,96288,26150,5.0,1159794905,Andrei Rublev (Andrey Rublyov) (1969)


In [4]:
from scipy.optimize import fmin_cg


def fit_users(ratings, mask, features, means=None, lam=20, seed=None):
    """Fit L2-regularized linear preference vectors for one or more users.

    The parameters ``theta`` (one row per user) minimize::

        0.5 * sum(mask.T * (features @ theta.T - ratings.T) ** 2)
        + 0.5 * lam * sum(theta ** 2)

    The minimization uses scipy's conjugate-gradient solver ``fmin_cg`` with
    the analytic gradient, starting from a uniform random point in [0, 1).

    Parameters
    ----------
    ratings : array, shape (n_users, n_items)
        Rating matrix. Entries where ``mask`` is false do not contribute.
    mask : array, shape (n_users, n_items)
        Truthy where a rating is present; converted to 0/1 integers.
    features : array, shape (n_items, n_features)
        Feature matrix of the items.
    means : optional
        Accepted for backward compatibility; it is not used by the fit.
    lam : float, default 20
        Weight of the L2 penalty on ``theta``.
    seed : int, optional
        When given, the starting point is drawn from a dedicated
        ``np.random.RandomState(seed)`` so the run is reproducible. When
        omitted, the global numpy random state is used.

    Returns
    -------
    numpy.ndarray
        The solver's result: ``theta`` flattened row by row to one dimension
        (``n_users * n_features`` values).
    """
    # Work in (items x users) orientation so it lines up with features @ theta.T.
    observed = mask.astype(int).T
    targets = ratings.T

    theta_shape = (targets.shape[1], features.shape[1])

    if seed is None:
        theta0 = np.random.rand(*theta_shape).flatten()
    else:
        theta0 = np.random.RandomState(seed).rand(*theta_shape).flatten()

    def residuals(folded):
        # fmin_cg works on flat vectors: unfold, then compute the prediction
        # errors with every unobserved entry zeroed out by the mask.
        theta = np.reshape(folded, theta_shape)
        return theta, observed * (features @ theta.T - targets)

    def optimization_target(folded):
        theta, differences = residuals(folded)
        return 0.5 * np.sum(differences**2) + 0.5 * lam * np.sum(theta**2)

    def gradient(folded):
        theta, differences = residuals(folded)
        return (differences.T @ features + lam * theta).flatten()

    return fmin_cg(f=optimization_target, x0=theta0, fprime=gradient)

# Dense rating vector over every movie that has tags; NaN marks "not rated".
# The explicit float dtype matters: built without data or dtype, recent pandas
# versions create an object-dtype Series, on which np.isfinite below raises.
user_rating_vector = pd.Series(np.nan, index=movies.index, dtype=float)

# Fill in the known ratings by movie-id label; `.values` keeps the assignment
# positional, so the row labels of `user_ratings` play no role.
user_rating_vector.loc[user_ratings.movie_id.values] = \
    user_ratings.rating.values

# Build the single-row (1 x n_movies) rating matrix and the mask of observed
# entries, then replace the NaN placeholders with 0.
user_ratings_matrix = user_rating_vector.values.reshape(1, -1)
ratings_mask = np.isfinite(user_ratings_matrix)
user_ratings_matrix = np.nan_to_num(user_ratings_matrix)

theta = fit_users(user_ratings_matrix, ratings_mask, genome_matrix.values)

Optimization terminated successfully.
         Current function value: 5.082708
         Iterations: 96
         Function evaluations: 210
         Gradient evaluations: 210


In [5]:
# Label each fitted weight with the genome tag it belongs to.
preferences = pd.Series(data=theta, index=genome_matrix.columns)

# The 20 tags with the largest weights.
preferences.sort_values(ascending=False).iloc[:20]

dark                  0.058778
masterpiece           0.057257
cult classic          0.056889
innocence lost        0.049008
oscar winner          0.048513
mentor                0.048007
mad scientist         0.043156
imagination           0.041792
imdb top 250          0.039944
love                  0.039931
dreams                0.039512
gunfight              0.039445
monster               0.038432
childhood             0.037762
prison                0.037117
social commentary     0.036180
weird                 0.035976
reflective            0.035807
great music           0.035077
alternate universe    0.034496
dtype: float64

In [6]:
# Predicted rating of every tagged movie (genome features times the fitted
# weights), stored next to the movie title.
predictions = pd.DataFrame(
    {'rating': genome_matrix.values @ theta, 'title': movies.title},
    index=movies.index)

# Top 20 predictions among the movies outside the user's rating mask.
predictions[~ratings_mask.ravel()] \
    .sort_values(by='rating', ascending=False).head(20)

Unnamed: 0_level_0,rating,title
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
778,5.293866,Trainspotting (1996)
5147,5.287688,Wild Strawberries (Smultronstället) (1957)
1237,5.286941,"Seventh Seal, The (Sjunde inseglet, Det) (1957)"
111,5.256287,Taxi Driver (1976)
1206,5.253421,"Clockwork Orange, A (1971)"
1251,5.228539,8 1/2 (8½) (1963)
99764,5.175642,It's Such a Beautiful Day (2012)
7361,5.158738,Eternal Sunshine of the Spotless Mind (2004)
7068,5.119508,Last Year at Marienbad (L'Année dernière à Mar...
1228,5.119437,Raging Bull (1980)
