# 5 Matrix Factorization Visualizations

In [1]:
# Setup
import utils
import matrix_factorization as mf
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%config InlineBackend.figure_format='retina'
sns.set()
sns.set_style("white")

In [2]:
# Problem dimensions and SGD hyperparameters reused by the cells below.
M, N = 943, 1682  # number of users, number of movies
K = 20            # passed to mf.train_model (presumably the latent dimension)
eta = 0.03        # passed to mf.train_model as `eta`
reg = 0.1         # passed to mf.train_model as `reg`

# Ratings: training split, test split, and the complete set.
Y_train = utils.get_training_data()
Y_test = utils.get_test_data()
Y = utils.get_data()

# Movie metadata, plus the genre-by-genre similarity derived from it.
movie_id, movie_title, movie_genre, genres = utils.get_movies()
genre_similarity = utils.genre_similarity(movie_genre)

## 1. Simple SGD from Homework

In [11]:
# Fit the factorization on the training ratings (include_bias left at its default).
U_simple, V_simple, err_simple = mf.train_model(Y_train, M, N, K, eta, reg)

# Divide the test error reported by mf.get_err by the number of test ratings.
n_test = Y_test.shape[0]
err_test_simple = mf.get_err(U_simple, V_simple, Y_test) / n_test

Epoch  0: current average training error 0.508
Epoch  1: current average training error 0.432
Epoch  2: current average training error 0.407
Epoch  3: current average training error 0.391
Epoch  4: current average training error 0.380
Epoch  5: current average training error 0.367
Epoch  6: current average training error 0.359
Epoch  7: current average training error 0.350
Epoch  8: current average training error 0.344
Epoch  9: current average training error 0.339
Epoch 10: current average training error 0.330
Epoch 11: current average training error 0.322
Epoch 12: current average training error 0.323


## 2. Incorporating a Bias Term

In [12]:
# Fit the factorization again, this time asking mf.train_model for bias terms;
# with include_bias=True it additionally returns `biases`.
U_bias, V_bias, biases, err_bias = mf.train_model(
    Y_train, M, N, K, eta, reg, include_bias=True
)

# Divide the test error reported by mf.get_err by the number of test ratings.
n_test = Y_test.shape[0]
err_test_bias = mf.get_err(U_bias, V_bias, Y_test, biases=biases) / n_test

Epoch  0: current average training error 0.443
Epoch  1: current average training error 0.412
Epoch  2: current average training error 0.399
Epoch  3: current average training error 0.389
Epoch  4: current average training error 0.376
Epoch  5: current average training error 0.368
Epoch  6: current average training error 0.357
Epoch  7: current average training error 0.350
Epoch  8: current average training error 0.342
Epoch  9: current average training error 0.334
Epoch 10: current average training error 0.327
Epoch 11: current average training error 0.320
Epoch 12: current average training error 0.314
Epoch 13: current average training error 0.307
Epoch 14: current average training error 0.304
Epoch 15: current average training error 0.300
Epoch 16: current average training error 0.297
Epoch 17: current average training error 0.293
Epoch 18: current average training error 0.289
Epoch 19: current average training error 0.288
Epoch 20: current average training error 0.285
Epoch 21: cur

## Off-the-shelf solution

In [10]:
import surprise
from surprise import accuracy
from surprise import SVD
from surprise import Reader
from surprise import Dataset

# The train/test split already exists on disk, so load it as a single
# predefined fold instead of letting Surprise draw random folds.
reader = Reader(rating_scale=(1, 5))
fulldata = Dataset.load_from_folds([("data/train.txt", "data/test.txt")], reader)
pkf = surprise.model_selection.PredefinedKFold()

# Use the notebook-wide K rather than a second hard-coded 20, so the
# off-the-shelf model always has the same number of factors as the
# hand-written ones when K is changed.
surprise_SVD = SVD(n_factors=K, n_epochs=30, biased=True)

# Only one (train, test) pair was passed to load_from_folds above.
for trainset, testset in pkf.split(fulldata):
    # Train on the training fold and predict the held-out ratings.
    surprise_SVD.fit(trainset)
    predictions = surprise_SVD.test(testset)

    # Root mean squared error on the test fold; verbose=True also prints it.
    surprise_error = accuracy.rmse(predictions, verbose=True)
    

RMSE: 0.9266


In [13]:
# Collect the three test errors in one table. The frame is built in a single
# constructor call: DataFrame.append was deprecated in pandas 1.4 and removed
# in pandas 2.0, and growing a frame row by row is needlessly quadratic.
# NOTE(review): `surprise_error` is an RMSE, while the other two values are
# mf.get_err divided by the number of test ratings -- confirm that both are
# the same metric before comparing the rows directly.
accuracies = pd.DataFrame(
    {"Error": [err_test_simple, err_test_bias, surprise_error]},
    index=["Simple SVD", "Biased SVD", "Surprise"],
)
accuracies

Unnamed: 0,Error
Simple SVD,0.452937
Biased SVD,0.429117
Surprise,0.926644


## Visualization

### Movie projections

In [14]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.layouts import row

def get_SVD_movie_projection(V):
    """Project a factor matrix onto its two leading left singular vectors.

    Parameters
    ----------
    V : array_like, shape (K, N)
        Factor matrix with one column per item (the callers in this notebook
        index its columns by movie).

    Returns
    -------
    numpy.ndarray, shape (2, N)
        Row 0 and row 1 hold the coordinates of every column of ``V`` along
        the first and second left singular vector of ``V``.
    """
    # The left singular vectors (columns of A) are the orthonormal directions
    # along which the columns of V vary the most. full_matrices=False avoids
    # building the N x N right factor, which is never used.
    A, _, _ = np.linalg.svd(V, full_matrices=False)

    # An actual projection is a matrix product with the leading two vectors.
    # The previous element-wise multiply with A's first row only rescaled each
    # row of V, so the plot showed two raw factors, not a projection.
    V_transformed = A[:, :2].T @ V

    return V_transformed

def get_movie_projection_plot(V, title, size=300):
    """Build a square Bokeh scatter plot from the first two rows of ``V``.

    Parameters
    ----------
    V : array indexable as ``V[0, :]`` and ``V[1, :]``
        Row 0 is used for the x coordinates and row 1 for the y coordinates.
    title : str
        Title shown above the plot.
    size : int, optional
        Width and height of the figure (passed as plot_width/plot_height).

    Returns
    -------
    The Bokeh figure, with the toolbar hidden and the axes labelled "1" and "2".
    """
    plot = figure(plot_width=size, plot_height=size, title=title)

    xs, ys = V[0, :], V[1, :]
    plot.circle(xs, ys)

    plot.xaxis.axis_label, plot.yaxis.axis_label = "1", "2"
    # Hide the toolbar so the three plots fit side by side in a row.
    plot.toolbar_location = None

    return plot

# 2D coordinates of the movies for each of the three models.
V_simple_transformed = get_SVD_movie_projection(V_simple)
V_bias_transformed = get_SVD_movie_projection(V_bias)
# Surprise keeps user factors in `pu` and item (movie) factors in `qi`, so a
# movie plot has to read `qi`; `pu` would plot users under a "Movie" title.
# NOTE(review): this takes the first two raw latent factors of the Surprise
# model rather than passing them through get_SVD_movie_projection -- confirm
# that this is the intended comparison.
V_surprise_transformed = surprise_SVD.qi[:, 0:2].T

simple_SVD_projection_plot = get_movie_projection_plot(V_simple_transformed, "Simple Movie SVD Projection")
bias_SVD_projection_plot = get_movie_projection_plot(V_bias_transformed, "Biased Movie SVD Projection")
off_the_shelf_projection_plot = get_movie_projection_plot(V_surprise_transformed, "Surprise Movie SVD Projection")

# Render inline in the notebook, all three plots in one row.
output_notebook()
show(row([simple_SVD_projection_plot, bias_SVD_projection_plot, off_the_shelf_projection_plot]))

In [None]:
# SVD of the movie factor matrix; the columns of A are its left singular vectors.
A, S, B = np.linalg.svd(V_simple)
print(A.shape, S.shape, B.shape)

# 2D coordinates of every movie: V projected onto the first two columns of A.
V2 = A[:, :2].T @ V_simple
print(V2.shape)

In [None]:
YM = utils.list_to_matrix(Y, M, N)
counts, ratings, ratings_bayesian = utils.bayesian_rating(Y, thr=4)

# argsort of the negated values yields indices in descending order.
rank_counts = np.argsort(-counts)                      # by number of ratings
rank_ratings = np.argsort(-ratings)                    # by rating
rank_ratings_bayesian = np.argsort(-ratings_bayesian)  # by bayesian rating

# Rating and count of the top five movies under each of the two rating rankings.
for ranking in (rank_ratings, rank_ratings_bayesian):
    top_five = ranking[:5]
    print(ratings[top_five], counts[top_five])
print(movie_title[rank_ratings_bayesian[:5]])

In [None]:
sns.set_style("ticks")
n_genres = len(genres)

# Work on a copy and blank out the diagonal (a genre paired with itself) so
# that it does not take part in the color scale.
genre_similarity_nodiag = genre_similarity.copy()
diagonal = np.arange(n_genres)
genre_similarity_nodiag[diagonal, diagonal] = np.nan

# Cells are centered on the integers 1..n_genres.
tick_positions = np.arange(n_genres) + 1
image_extent = [0.5, n_genres + 0.5, 0.5, n_genres + 0.5]

plt.figure(dpi=300)
# The matrix is rotated by 180 degrees; the x labels are reversed to match,
# and the y labels read bottom-to-top because imshow draws row 0 at the top.
plt.imshow(np.rot90(genre_similarity_nodiag, 2), extent=image_extent, cmap='viridis')
plt.xticks(tick_positions, genres[::-1], rotation='vertical')
plt.yticks(tick_positions, genres)
plt.colorbar()
plt.title('Probability of a movie belonging to a certain combination of genres')
plt.show()

In [None]:
# Movies in the 2D projection, colored by their bayesian corrected rating.
plt.figure(dpi=300)
plt.scatter(V2[0, :], V2[1, :], s=5, c=ratings_bayesian, cmap='viridis')
# Title spelling fixed ("avarage" -> "average"); it is rendered in the figure.
plt.title('2D visualization of average movie ratings, bayesian corrected')
plt.colorbar()
plt.axis('off')
plt.show()

In [None]:
# Movies in the 2D projection, colored by how many ratings each one received.
plt.figure(dpi=300)
x_coords, y_coords = V2[0, :], V2[1, :]
plt.scatter(x_coords, y_coords, s=5, c=counts, cmap='viridis')
plt.colorbar()
plt.title('2D visualization of number of movie ratings')
plt.axis('off')
plt.show()

In [None]:
plt.figure(dpi=300)

# All movies as a faint gray backdrop.
plt.scatter(V2[0, :], V2[1, :], s=1, c='gray')

# One larger, individually colored and labelled marker for each of the ten
# movies ranked highest by bayesian corrected rating.
for movie_idx in rank_ratings_bayesian[:10]:
    plt.scatter(V2[0, movie_idx], V2[1, movie_idx], s=20, label=movie_title[movie_idx])

plt.axis('off')
plt.legend(prop={'size': 5})
plt.title('2D visualization of the ten best movies\naccording to the bayesian corrected ratings')
plt.show()

In [None]:
plt.figure(dpi=300)

# All movies as a faint gray backdrop.
plt.scatter(V2[0, :], V2[1, :], s=1, c='gray')

# One larger, individually colored and labelled marker for each of the ten
# movies with the most ratings.
for movie_idx in rank_counts[:10]:
    plt.scatter(V2[0, movie_idx], V2[1, movie_idx], s=20, label=movie_title[movie_idx])

plt.axis('off')
plt.legend(prop={'size': 5})
plt.title('2D visualization of the ten most popular movies')
plt.show()