# 5 Matrix Factorization Visualizations

In [1]:
# Setup
import utils
import matrix_factorization as mf
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Ask the inline backend for 'retina' (high-resolution) figure output.
%config InlineBackend.figure_format='retina'
# Apply seaborn's default theme first, then override the axes style with
# "white" for the figures that follow.
sns.set()
sns.set_style("white")

In [2]:
# --- Problem size and training hyperparameters -----------------------------
M, N = 943, 1682  # number of users, number of movies
K = 20            # passed to mf.train_model; presumably the number of latent factors — TODO confirm
reg = 0.1         # presumably the regularization strength — TODO confirm
eta = 0.03        # presumably the SGD step size — TODO confirm

# --- Data --------------------------------------------------------------------
# The separate train/test loaders are left disabled; the notebook works on the
# complete rating list instead.
# Y_train = utils.get_training_data()
# Y_test = utils.get_test_data()
Y = utils.get_data()

# Movie metadata, plus a genre-by-genre similarity table derived from the
# per-movie genre information.
movie_id, movie_title, movie_genre, genres = utils.get_movies()
genre_similarity = utils.genre_similarity(movie_genre)

## 1. Simple SGD from Homework

In [None]:
# Fit the factorization without passing include_bias; train_model hands back
# three values, unpacked here as the two factor matrices and an error figure.
# NOTE(review): assumes U_* holds the user factors and V_* the movie factors —
# confirm against matrix_factorization.train_model.
U_simple, V_simple, err_simple = mf.train_model(Y, M, N, K, eta, reg)

## 2. Incorporating a Bias Term

In [None]:
# Same call as the simple model, but with include_bias=True.
# NOTE(review): whether the returned matrices carry the bias terms (and in
# what layout) is not visible here — confirm in matrix_factorization.train_model.
U_bias, V_bias, err_bias = mf.train_model(Y, M, N, K, eta, reg, include_bias=True)

In [None]:
# NOTE(review): disabled error report. It references err, err_test, Y_train and
# Y_test, none of which are defined by the cells above (the train/test loaders
# are commented out in the data-loading cell, and training stores its errors
# as err_simple / err_bias), so it cannot be re-enabled as-is.
# print('Average training error =', '{:.3f}'.format(err/len(Y_train)))
# print('Average test error =', '{:.3f}'.format(err_test/len(Y_test)))

In [None]:
# Choose the factor matrix to visualize. This cell previously read a name `V`
# that no earlier cell defines, so it failed with NameError on a fresh kernel
# (Restart & Run All).
# NOTE(review): V_simple is used here; switch to V_bias to visualize the model
# trained with include_bias=True — confirm which one was intended.
V = V_simple

# SVD of the movie latent factors: A holds the left-singular vectors as
# columns, S the singular values, B the right-singular vectors as rows.
A, S, B = np.linalg.svd(V)
print(A.shape, S.shape, B.shape)

# Project V onto the first two left-singular vectors, giving one 2D point per
# column of V (V2 has two rows).
V2 = A.T[:2, :] @ V
print(V2.shape)

In [None]:
# Rating list converted to matrix form by the project helper.
YM = utils.list_to_matrix(Y, M, N)

# Per-movie rating count, plain rating, and Bayesian-corrected rating.
# NOTE(review): the meaning of thr=4 is defined inside utils.bayesian_rating —
# confirm there.
counts, ratings, ratings_bayesian = utils.bayesian_rating(Y, thr=4)

# Index orderings, best first (argsort of the negated values = descending).
rank_counts = np.argsort(-counts)                      # by number of ratings
rank_ratings = np.argsort(-ratings)                    # by plain rating
rank_ratings_bayesian = np.argsort(-ratings_bayesian)  # by Bayesian-corrected rating

# For the top five of each rating-based ordering, show the plain rating next
# to the rating count.
for ranking in (rank_ratings, rank_ratings_bayesian):
    top_five = ranking[:5]
    print(ratings[top_five], counts[top_five])

# Titles of the five best movies under the Bayesian-corrected ordering.
print(movie_title[rank_ratings_bayesian[:5]])

In [None]:
# Heat map of the genre-by-genre similarity table.
sns.set_style("ticks")
n_genres = len(genres)

# Work on a copy and mark the diagonal as NaN so that each genre's pairing
# with itself is left out of the image.
genre_similarity_nodiag = genre_similarity.copy()
diagonal = np.arange(n_genres)
genre_similarity_nodiag[diagonal, diagonal] = np.nan

# Cells are centred on the integers 1..n_genres, hence the half-unit margins.
tick_positions = np.arange(1, n_genres + 1)
heatmap_extent = [0.5, n_genres + 0.5] * 2

plt.figure(dpi=300)
# The matrix is rotated by 180 degrees before drawing; the x labels are
# reversed to match, while the y labels run in the original genre order.
plt.imshow(
    np.rot90(genre_similarity_nodiag, k=2),
    extent=heatmap_extent,
    cmap='viridis',
)
plt.xticks(tick_positions, genres[::-1], rotation='vertical')
plt.yticks(tick_positions, genres)
plt.colorbar()
plt.title('Probability of a movie belonging to a certain combination of genres')
plt.show()

In [None]:
# 2D projection of the movies (one point per column of V2), colored by the
# Bayesian-corrected rating of each movie.
plt.figure(dpi=300)
plt.scatter(V2[0, :], V2[1, :], s=5, c=ratings_bayesian, cmap='viridis')
# Title typo fixed: "avarage" -> "average".
plt.title('2D visualization of average movie ratings, bayesian corrected')
plt.colorbar()
# The projected axes carry no interpretable units, so they are hidden.
plt.axis('off')
plt.show()

In [None]:
# 2D projection of the movies, this time colored by the number of ratings
# each movie has.
plt.figure(dpi=300)
plt.scatter(V2[0], V2[1], s=5, c=counts, cmap='viridis')
plt.title('2D visualization of number of movie ratings')
plt.colorbar()
plt.axis('off')  # hide the axes of the projection
plt.show()

In [None]:
# Highlight the ten best movies (Bayesian-corrected ranking) on top of the
# full 2D projection.
plt.figure(dpi=300)

# Background layer: every movie as a small gray dot.
plt.scatter(V2[0], V2[1], s=1, c='gray')

# One scatter call per highlighted movie so each gets its own color and
# legend entry.
best_ten = rank_ratings_bayesian[:10]
for idx in best_ten:
    plt.scatter(V2[0, idx], V2[1, idx], s=20, label=movie_title[idx])

plt.axis('off')
plt.legend(prop={'size': 5})
plt.title('2D visualization of the ten best movies\naccording to the bayesian corrected ratings')
plt.show()

In [None]:
# Highlight the ten most-rated movies on top of the full 2D projection.
plt.figure(dpi=300)

# Background layer: every movie as a small gray dot.
plt.scatter(V2[0], V2[1], s=1, c='gray')

# One scatter call per highlighted movie so each gets its own color and
# legend entry.
most_rated_ten = rank_counts[:10]
for idx in most_rated_ten:
    plt.scatter(V2[0, idx], V2[1, idx], s=20, label=movie_title[idx])

plt.axis('off')
plt.legend(prop={'size': 5})
plt.title('2D visualization of the ten most popular movies')
plt.show()