# Custom Visualizations

In [1]:
# Setup
import utils
import matrix_factorization as mf
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%config InlineBackend.figure_format='retina'
# Apply seaborn's default settings first, then switch the axes style to the
# plain "white" style for the matplotlib figures in this notebook.
sns.set()
sns.set_style("white")

In [20]:
# --- Ratings -----------------------------------------------------------------
# Three rating tables from the project helpers: the training split, the test
# split and the complete data set. Later cells read column 0 of each row as a
# 1-based user id and column 1 as a 1-based movie id.
Y_train = utils.get_training_data()
Y_test = utils.get_test_data()
Y = utils.get_data()

# --- Movie metadata ----------------------------------------------------------
(movie_id, movie_title, movie_genre, genres) = utils.get_movies()
genre_similarity = utils.genre_similarity(movie_genre)

# --- Problem size and training hyper-parameters ------------------------------
M, N = 943, 1682            # number of users, number of movies
K, reg, eta = 20, 0.1, 0.03  # forwarded to mf.train_model further down
num_genres = len(genres)

## Quantifying Critics

In [33]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook

# Number of ratings submitted by each user. Column 0 of Y holds the 1-based
# user id, so subtract 1 to get a row index. np.add.at accumulates repeated
# indices one by one, exactly like the explicit per-rating loop it replaces.
num_user_reviews = np.zeros((M, ))
np.add.at(num_user_reviews, Y[:, 0] - 1, 1)

# Build fixed-width bins that reach *past* the largest count. Stopping at the
# last multiple of bin_size below the maximum would make np.histogram silently
# drop the most active users, because values beyond the last edge are not
# counted.
max_num_reviews = max(num_user_reviews)
bin_size = 20
num_bins = max(1, int(np.ceil(max_num_reviews / bin_size)))
bin_edges = [bin_size * edge_index for edge_index in range(num_bins + 1)]
bin_centers = np.array(bin_edges[1:]) - bin_size / 2
user_review_counts, _ = np.histogram(num_user_reviews, bins=bin_edges)

# NOTE(review): plot_width/plot_height are the older Bokeh keyword names;
# confirm they are still accepted by the Bokeh version in use.
plot = figure(title="Number of reviews per user", plot_width=500, plot_height=500)
plot.xaxis.axis_label = "Number of reviews"
plot.yaxis.axis_label = "Number of users"
plot.vbar(x=bin_centers, top=user_review_counts, width=0.9*bin_size)

output_notebook()
show(plot)

In [95]:
# Per-user genre profile: row u accumulates, over all of user u's ratings, the
# genre vector of the rated movie scaled to sum to 1. Every rating therefore
# adds a total weight of 1 to the user's row, however many genres the movie has.
num_user_reviews_by_genre = np.zeros((M, num_genres))

for rating in Y:
    user_index = rating[0] - 1
    movie_index = rating[1] - 1

    rating_genres = movie_genre[movie_index]
    # NOTE(review): a movie with no genre set would divide by zero here;
    # presumably every movie has at least one genre - confirm.
    num_user_reviews_by_genre[user_index, :] += rating_genres / sum(rating_genres)

# Turn every row into a distribution over genres (rows sum to 1).
num_user_reviews_by_genre /= num_user_reviews_by_genre.sum(axis=1, keepdims=True)

# Spread of each user's genre distribution. Despite the variable name this is
# a standard deviation (np.std), not a variance; the name is kept because a
# later cell reads it.
user_genre_variance = num_user_reviews_by_genre.std(axis=1)

max_variance = max(user_genre_variance)
bin_edges = np.linspace(0, max_variance, 20)
bin_size = bin_edges[1] - bin_edges[0]
# bin_size is a small float here, so truncating half of it to an int gives 0
# and would place every bar on the right bin edge. Use the true half-width.
bin_centers = bin_edges[:-1] + bin_size / 2
counts, _ = np.histogram(user_genre_variance, bins=bin_edges)
plot = figure(title="Genre variance by user", plot_width=500, plot_height=500)
plot.xaxis.axis_label = "Std. dev. of per-genre review share"
plot.yaxis.axis_label = "Number of users"
plot.vbar(x=bin_centers, top=counts, width=0.9*bin_size)

show(plot)

In [96]:
# Fit the matrix-factorization model with bias terms enabled.
# NOTE(review): the model is fitted on Y (utils.get_data()) rather than
# Y_train; if Y also contains the test ratings, the test error computed below
# is optimistic - confirm this is intended.
U_bias, V_bias, biases, err_bias = mf.train_model(
    Y, M, N, K, eta, reg,
    include_bias=True,
)

# Error on the test split, divided by the number of test ratings.
num_test_ratings = Y_test.shape[0]
err_test_bias = mf.get_err(U_bias, V_bias, Y_test, biases=biases) / num_test_ratings

Epoch  0: current average training error 0.443
Epoch  1: current average training error 0.415
Epoch  2: current average training error 0.401
Epoch  3: current average training error 0.391
Epoch  4: current average training error 0.380
Epoch  5: current average training error 0.372
Epoch  6: current average training error 0.363
Epoch  7: current average training error 0.353
Epoch  8: current average training error 0.347
Epoch  9: current average training error 0.341
Epoch 10: current average training error 0.333
Epoch 11: current average training error 0.329
Epoch 12: current average training error 0.321
Epoch 13: current average training error 0.318
Epoch 14: current average training error 0.315
Epoch 15: current average training error 0.311
Epoch 16: current average training error 0.308
Epoch 17: current average training error 0.304
Epoch 18: current average training error 0.301
Epoch 19: current average training error 0.298
Epoch 20: current average training error 0.295
Epoch 21: cur

In [111]:
from bokeh.models import ColumnDataSource, LinearColorMapper

def get_SVD_user_projection(U):
    """Express the user latent factors in the basis of their singular vectors.

    The previous implementation multiplied ``U`` element-wise by the first row
    of the left singular vectors, which only rescales each latent dimension
    and is not a projection. This version applies the rotation ``A.T . U``.

    Parameters
    ----------
    U : 2-D array
        Latent factor matrix. The plotting helper reads rows 0 and 1 of the
        result as per-user coordinates, so users are assumed to be the
        columns of ``U`` - confirm against ``mf.train_model``.

    Returns
    -------
    ndarray, same shape as ``U``
        Row ``i`` holds the coordinates along the ``i``-th left singular
        vector, ordered by decreasing singular value. Rows 0 and 1 are
        therefore the two-dimensional SVD projection used for plotting.
    """
    # Columns of A are the left singular vectors of the user factor matrix.
    A, _, _ = np.linalg.svd(U)

    # Rotate onto the singular directions (a matrix product, not an
    # element-wise one); the shape of U is preserved.
    U_transformed = np.dot(A.T, U)

    return U_transformed

def get_user_projection_plot(U, title, values, size=300):
    """Build a square Bokeh scatter plot of a 2-D projection, coloured by value.

    Parameters
    ----------
    U : 2-D array
        Row 0 supplies the x coordinates and row 1 the y coordinates.
    title : str
        Figure title.
    values : sequence
        One number per point; mapped linearly onto the 'Magma256' palette
        between its minimum and maximum.
    size : int, optional
        Width and height of the figure (default 300).

    Returns
    -------
    The Bokeh figure; the caller is responsible for showing it.
    """
    scatter = figure(plot_width=size, plot_height=size, title=title)
    scatter.xaxis.axis_label = "1"
    scatter.yaxis.axis_label = "2"

    columns = {"x": U[0, :], "y": U[1, :], "values": values}
    points = ColumnDataSource(columns)

    # The colour scale spans exactly the observed range of `values`.
    palette_map = LinearColorMapper(palette='Magma256', low=min(values), high=max(values))
    point_color = {'field': 'values', 'transform': palette_map}
    scatter.circle(x="x", y="y", color=point_color, source=points, size=10)

    return scatter

# Heuristic "critic" score in [0, 2], the sum of two terms scaled by their maxima:
#   * genre breadth - high when the user's per-genre review share has a low spread
#   * review volume - high when the user has written many reviews
genre_breadth = 1 - user_genre_variance / max(user_genre_variance)
review_volume = num_user_reviews / max(num_user_reviews)
critic_rank = genre_breadth + review_volume

# Scatter the transformed user factors, coloured by the critic score.
U_bias_transformed = get_SVD_user_projection(U_bias)
plot = get_user_projection_plot(
    U_bias_transformed,
    "User SVD Projection",
    critic_rank,
    size=800,
)

show(plot)