In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import warnings


# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by numpy, pandas
# and keras — confirm that a blanket 'ignore' is really intended.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline, directly below the cell that draws them.
%matplotlib inline


# Reload imported modules automatically before each cell is executed, so edits
# to local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Dot-product recommender with MSE loss function

In [None]:
from helpers import load_data

# Locations of the training ratings and of the sample-submission file.
DATA_TRAIN_PATH = "data/data_train.csv"
DATA_TEST_PATH = "data/sampleSubmission.csv"

# Both files are read with the same project helper.
ratings = load_data(DATA_TRAIN_PATH)
samples = load_data(DATA_TEST_PATH)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# NOTE(review): `data` is not defined in any visible cell above (only `ratings`
# and `samples` are) — confirm which cell builds it, otherwise this cell fails
# on a fresh kernel. It is used here as a DataFrame with `user_id`, `movie_id`
# and `rating` columns.

# The 15 users who gave the most ratings (index: user_id, value: rating count).
top_users = (
    data.groupby('user_id')['rating']
    .count()
    .sort_values(ascending=False)
    .head(15)
)

# The 15 movies that received the most ratings (index: movie_id, value: rating count).
top_movies = (
    data.groupby('movie_id')['rating']
    .count()
    .sort_values(ascending=False)
    .head(15)
)

# Keep only the ratings given by a top user to a top movie. Filtering with
# `isin` keeps the same rows an inner join on each key would, without appending
# the count columns (joining both count Series with the same suffix yields two
# columns that are both named `rating_r`).
is_top_pair = data['user_id'].isin(top_users.index) & data['movie_id'].isin(top_movies.index)
top_r = data[is_top_pair]

# User x movie table of ratings. The string 'sum' selects the same aggregation
# as np.sum without the callable-aggregation FutureWarning of recent pandas.
pd.crosstab(top_r.user_id, top_r.movie_id, top_r.rating, aggfunc='sum')

In [None]:
# Encode the raw user ids as contiguous integer labels; these labels are what
# the model later receives as the `user` input.
user_enc = LabelEncoder()
data['user'] = user_enc.fit_transform(data['user_id'].values)
n_users = data['user'].nunique()

# Same encoding for the movie ids.
item_enc = LabelEncoder()
data['movie'] = item_enc.fit_transform(data['movie_id'].values)
n_movies = data['movie'].nunique()

# Cast ratings to integers. `np.int` was only an alias of the builtin `int`
# and has been removed from NumPy (1.24+), so the builtin is used directly.
data['rating'] = data['rating'].values.astype(int)
min_rating = min(data['rating'])
max_rating = max(data['rating'])

# Displayed as the cell output.
n_users, n_movies, max_rating, min_rating

In [None]:
# Features are the encoded (user, movie) label pairs; the target is the rating.
X = data[['user', 'movie']].values
y = data['rating']

# Hold out 10% of the ratings; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Dimension of the embedding vectors learned for users and movies.
n_factors = 50

# Split the two feature columns into separate arrays, in the order of the
# model's inputs: user labels first, movie labels second.
X_train_array = [X_train[:, col] for col in (0, 1)]
X_test_array = [X_test[:, col] for col in (0, 1)]

In [None]:
from keras.models import Model,load_model
from keras.layers import Input, Reshape, Dot
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.regularizers import l2
from keras.layers import Add, Activation, Lambda

class EmbeddingLayer:
    """Callable that embeds integer ids and flattens the result to a vector.

    n_items: number of rows of the embedding table
    n_factors: length of each embedding vector
    """

    def __init__(self, n_items, n_factors):
        self.n_items, self.n_factors = n_items, n_factors

    def __call__(self, x):
        """Apply a fresh Embedding layer to `x` and reshape to (n_factors,)."""
        embedding = Embedding(
            self.n_items,
            self.n_factors,
            embeddings_initializer='he_normal',
            embeddings_regularizer=l2(1e-6),
        )
        embedded = embedding(x)
        # Reshape the embedding output to a flat vector of length n_factors.
        flatten = Reshape((self.n_factors,))
        return flatten(embedded)
    
def RecommenderV2(n_users, n_movies, n_factors, min_rating, max_rating):
    """Build and compile a dot-product recommender with user and movie biases.

    The prediction is the dot product of a user embedding and a movie
    embedding, plus one bias term per user and per movie, passed through a
    sigmoid and rescaled to the interval [min_rating, max_rating]. The model
    is compiled with a mean-squared-error loss and the Adam optimizer.

    n_users, n_movies: sizes of the user and movie embedding tables
    n_factors: length of the user and movie embedding vectors
    min_rating, max_rating: lower and upper bound of the rescaled output

    Returns the compiled Model, whose inputs are [user, movie].
    """
    # User side: latent vector and scalar bias, both looked up from the same input.
    user_in = Input(shape=(1,))
    user_vec = EmbeddingLayer(n_users, n_factors)(user_in)
    user_bias = EmbeddingLayer(n_users, 1)(user_in)

    # Movie side, built the same way.
    movie_in = Input(shape=(1,))
    movie_vec = EmbeddingLayer(n_movies, n_factors)(movie_in)
    movie_bias = EmbeddingLayer(n_movies, 1)(movie_in)

    # Interaction term plus the two biases.
    interaction = Dot(axes=1)([user_vec, movie_vec])
    score = Add()([interaction, user_bias, movie_bias])

    # Squash to (0, 1), then stretch onto the rating scale.
    squashed = Activation('sigmoid')(score)
    rating = Lambda(lambda t: t * (max_rating - min_rating) + min_rating)(squashed)

    model = Model(inputs=[user_in, movie_in], outputs=rating)
    model.compile(loss='mean_squared_error', optimizer=Adam(lr=0.001))
    return model

In [None]:
# File used to cache the trained model between runs.
MODEL_PATH = 'RecommenderV2.h5'

model = RecommenderV2(n_users, n_movies, n_factors, min_rating, max_rating)
model.summary()

# Reuse the model saved by a previous run when the file exists; otherwise
# train, save the result and plot the validation loss of every epoch.
# (A stray commit hash after the colon of the `if` line made this cell a
# SyntaxError; it has been removed.)
if os.path.exists(MODEL_PATH):
    model = load_model(MODEL_PATH)
else:
    history = model.fit(
        x=X_train_array,
        y=y_train,
        batch_size=10000,
        epochs=10,
        verbose=1,
        validation_data=(X_test_array, y_test),
    )
    model.save(MODEL_PATH)
    plt.plot(history.history['val_loss'])
    plt.xlabel("Epochs")
    plt.ylabel("Test Error")

In [None]:
from sklearn.metrics import mean_squared_error

# Take the first value of every predicted row so the predictions form a flat
# array that can be compared with y_test.
raw_predictions = model.predict(x=X_test_array)
prediction_test = np.array([row[0] for row in raw_predictions])

# Mean squared error on the held-out split.
print(mean_squared_error(y_test, prediction_test))