In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import warnings

warnings.filterwarnings('ignore')
%matplotlib inline

%load_ext autoreload
%autoreload 2

# Dot Product with RMSE loss function

In [7]:
from helpers import load_data

DATA_TRAIN_PATH = "data/data_train.csv"
ratings = load_data(DATA_TRAIN_PATH)

DATA_TEST_PATH = "data/sampleSubmission.csv"
samples = load_data(DATA_TEST_PATH)

In [72]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

user_enc = LabelEncoder()
ratings['user'] = user_enc.fit_transform(ratings['user_id'].values)
n_users = ratings['user'].nunique()

item_enc = LabelEncoder()
ratings['movie'] = item_enc.fit_transform(ratings['movie_id'].values)
n_movies = ratings['movie'].nunique()

ratings['rating'] = ratings['rating'].values.astype(np.int)

In [73]:
X = ratings[['user', 'movie']].values
y = ratings['rating']

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.1, random_state=42)

In [10]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,user,movie
0,44,1,4,3779,0
1,61,1,3,5668,0
2,67,1,4,6334,0
3,72,1,3,6890,0
4,86,1,5,8445,0


In [74]:
n_factors = 5

X_train_array = [X_train[:,0], X_train[:,1]]
X_test_array = [X_test[:,0], X_test[:,1]]

In [75]:
from keras.models import Model,load_model
from keras.layers import Input, Reshape, Dot
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.regularizers import l2

def RecommenderV1(n_users, n_movies, n_factors):
    """Create a dot product recommender system with MSE loss function and Adam optimizer"""
    user = Input(shape=(1,))
    u = Embedding(n_users, n_factors, embeddings_initializer='he_normal', embeddings_regularizer=l2(1e-6))(user)
    u = Reshape((n_factors,))(u)
    
    movie = Input(shape=(1,))
    m = Embedding(n_movies, n_factors, embeddings_initializer='he_normal', embeddings_regularizer=l2(1e-6))(movie)
    m = Reshape((n_factors,))(m)
    
    x = Dot(axes=1)([u, m])
    model = Model(inputs=[user, movie], outputs=x)
    opt = Adam(lr=0.001)
    model.compile(loss='mean_squared_error', optimizer=opt)
    return model

In [76]:
model = RecommenderV1(n_users, n_movies, n_factors)
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 1, 5)         50000       input_3[0][0]                    
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 1, 5)         5000        input_4[0][0]                    
____________________________________________________________________________________________

In [88]:
if os.path.exists('RecommenderV1.h5'):
    model = load_model('RecommenderV1.h5')
else:
    history = model.fit(x=X_train_array, y=y_train, batch_size=10000, epochs=5, verbose=1, validation_data=(X_test_array, y_test))
    model.save('RecommenderV1.h5')
    plt.plot(history.history['val_loss'])
    plt.xlabel("Epochs")
    plt.ylabel("Test Error")

In [89]:
from sklearn.metrics import mean_squared_error

prediction_test = np.array([a[0] for a in model.predict(x=X_test_array)])

print (mean_squared_error(y_test, prediction_test))

1.5807112037098587


In [90]:
user_enc = LabelEncoder()
samples['user'] = user_enc.fit_transform(samples['user_id'].values)
n_users = samples['user'].nunique()

item_enc = LabelEncoder()
samples['movie'] = item_enc.fit_transform(samples['movie_id'].values)
n_movies = samples['movie'].nunique()

samples['rating'] = samples['rating'].values.astype(np.int)

X_samples = samples[['user', 'movie']].values
X_indices = samples[['user_id', 'movie_id']].values
X_samples_array = [X_samples[:,0], X_samples[:,1]]

prediction = np.array([a[0] for a in model.predict(x=X_samples_array)])
prediction = np.rint(prediction)

In [91]:
submission = load_data(DATA_TEST_PATH)
submission['rating'] = prediction

In [92]:
from helpers import create_csv

DATA_SUBMISSION = "data/submission_dotprod.csv"
create_csv(DATA_SUBMISSION, submission)

In [95]:
test = False
for i in range(submission.index.stop):
    if (submission.rating[i] == 8):
        test = True

In [94]:
print(test)

True
