# Matrix Factorization

In [31]:
import os

import numpy as np
import pandas as pd
from IPython.display import SVG

from keras.models import Model, load_model
from keras.utils.vis_utils import model_to_dot
from keras import layers
from keras.utils import plot_model
from keras.backend import mean
from keras.preprocessing import text
from keras import callbacks
from sklearn import preprocessing

from sklearn.model_selection import train_test_split
from sklearn import metrics

from sklearn.neighbors import NearestNeighbors

## Train

In [None]:
# Fixed seed so the subsample and the train/test split are identical on every
# Restart & Run All. Later cells reload a checkpoint from disk and score it on
# df_test; without a seed a fresh kernel would draw a different split and the
# "test" rows could include rows the saved model was trained on.
SEED = 42

# Work on a 500,000-row random subsample of the MovieLens ml-20m ratings file.
df = pd.read_csv("../../datasets/movielens/ml-20m/ratings.csv").sample(
    n=500000,
    random_state=SEED,
)

# Hold out part of the subsample for validation (library-default proportions).
df_train, df_test = train_test_split(df, random_state=SEED)

# Raw movieId / userId values are fed to the embedding layers unchanged, so the
# embedding tables are sized from the largest id present in the subsample.
n_movies = df.movieId.max()
n_users = df.userId.max()
# Dimensionality of each user / movie embedding vector.
n_latent_factors = 50

# Movie branch: one integer id in -> one n_latent_factors-dim vector out.
# The table has n_movies + 1 rows so that the largest raw movieId is a valid index.
movie_input = layers.Input(shape=[1], name='Item')
movie_embedding = layers.Embedding(n_movies + 1,
                                   n_latent_factors,
                                   name='Movie-Embedding')(movie_input)
# Embedding output is (batch, 1, n_latent_factors); flatten to (batch, n_latent_factors).
movie_vec = layers.Flatten()(movie_embedding)

# User branch: same structure, keyed by raw userId.
user_input = layers.Input(shape=[1],
                          name='User')
user_embedding = layers.Embedding(n_users + 1,
                                  n_latent_factors,
                                  name='User-Embedding')(user_input)
user_vec = layers.Flatten()(user_embedding)

# Predicted rating = dot product of the two latent vectors.
# `layers.dot` replaces the legacy `layers.merge(..., mode='dot')`, which was
# deprecated in Keras 2.0 and later removed; axes=1 contracts the latent
# dimension of the two (batch, n_latent_factors) tensors, giving (batch, 1).
prod = layers.dot([movie_vec, user_vec], axes=1, name='DotProduct')

# Input order is [user, movie]; every fit/predict call must pass ids in that order.
model = Model([user_input, movie_input], prod)
model.compile('adam', 'mean_squared_error')

# The checkpoint is written into bin/; create it up front so a fresh checkout
# does not fail on the first save after a full epoch of training.
# (isdir + makedirs rather than exist_ok= keeps this valid on Python 2 as well.)
if not os.path.isdir("bin"):
    os.makedirs("bin")

# Keep only the weights from the epoch with the lowest validation loss.
checkpoint = callbacks.ModelCheckpoint("bin/mf.h5",
                                       monitor='val_loss',
                                       verbose=0,
                                       save_best_only=True)

# Inputs are passed as [userId, movieId], matching the Model's input order.
# df_test serves as the validation set that drives the checkpoint's val_loss.
history = model.fit(x=[df_train.userId, df_train.movieId],
                    y=df_train.rating,
                    epochs=30,
                    verbose=1,
                    callbacks=[checkpoint],
                    validation_data=(
                        [df_test.userId, df_test.movieId],
                        df_test.rating)
                    )

## Predict

In [4]:
# Rebind `model` to the checkpoint saved by the training cell's ModelCheckpoint
# callback (save_best_only=True on val_loss), rather than the last-epoch weights.
model = load_model("./bin/mf.h5")

  return cls(**config)


In [6]:
# Ground-truth ratings of the held-out rows, as a plain Python list.
y_true = df_test.rating.tolist()

# Model predictions for the same rows; ids are passed as [userId, movieId],
# the input order the model was built with.
y_pred = model.predict(
    x=[
        df_test.userId,
        df_test.movieId,
    ]
)

In [7]:
# Weight matrix of the movie embedding layer (first entry of its weight list).
# The layer is indexed by raw movieId during training, so row i of E is the
# latent vector for movieId i.
# NOTE(review): ids absent from the training sample presumably keep their
# initial (untrained) vectors — confirm before interpreting neighbours.
E = model.get_layer("Movie-Embedding").get_weights()[0]

In [14]:
# Build a ball-tree index over all embedding rows; each query returns the 20
# closest rows under the estimator's default distance metric.
nbrs = NearestNeighbors(n_neighbors=20, algorithm='ball_tree').fit(E)

In [30]:
# Time one lookup: the 20 nearest embedding rows to row 5 of E. kneighbors
# expects a 2-D query, hence the reshape to a single-row matrix. The result is
# (distances, row indices); the query row itself comes back first at distance 0.
%time nbrs.kneighbors(E[5].reshape(1,-1))

CPU times: user 18.4 ms, sys: 2.13 ms, total: 20.5 ms
Wall time: 18.9 ms


(array([[0.        , 2.0756038 , 2.09173514, 2.09795381, 2.16329627,
         2.17545097, 2.18222597, 2.19936834, 2.20466627, 2.21050236,
         2.21092981, 2.21195929, 2.21676416, 2.21705692, 2.22327044,
         2.223486  , 2.22401078, 2.2289132 , 2.23346397, 2.24184798]]),
 array([[    5, 43869, 99750, 51004, 55255, 71533,   386,  7675,  8571,
          3149,  7940,  8639, 58992, 80469,  3445,  6162,  3850,  7168,
         39481,  4499]]))