# Dot Product with Categorical Cross Entropy (CCE) loss function

Recommender system with categorical cross entropy loss function with cross validation

In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

Using TensorFlow backend.


## Data - preprocessing

In [2]:
from helpers import load_data

DATA_TRAIN_PATH = "data/data_train.csv"
ratings = load_data(DATA_TRAIN_PATH)

DATA_TEST_PATH = "data/sampleSubmission.csv"
samples = load_data(DATA_TEST_PATH)

In [3]:
n_users = len(ratings.user_id.unique())
n_movies = len(ratings.movie_id.unique())

In [4]:
from sklearn.model_selection import train_test_split, KFold

# split of the data: 0.9 train, 0.1 test
train, test = train_test_split(ratings, test_size=0.1, random_state=42)

#######################
# A mettre dans une focntion : to_categorical

user_enc = LabelEncoder()
train['user'] = user_enc.fit_transform(train['user_id'].values)
test['user'] = user_enc.fit_transform(test['user_id'].values)

item_enc = LabelEncoder()
train['movie'] = item_enc.fit_transform(train['movie_id'].values)
test['movie'] = item_enc.fit_transform(test['movie_id'].values)

train['rating'] = train['rating'].values.astype(np.int)
test['rating'] = test['rating'].values.astype(np.int)
    
X_train = train[['user', 'movie']].values
y_train = train ['rating']
    
X_test = test[['user', 'movie']].values
y_test = test ['rating']
    
X_train_array = [X_train[:,0], X_train[:,1]]
X_test_array = [X_test[:,0], X_test[:,1]]
    
encoder = LabelEncoder()
encoder.fit(y_train)
encoder_train = encoder.transform(y_train)
encoder_test = encoder.transform(y_test)

y_train = np_utils.to_categorical(encoder_train)
y_test = np_utils.to_categorical(encoder_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_i

## Neural network model 

In [36]:
from keras.models import Model,load_model
from keras.layers import Input, Reshape, Dot, Flatten
from keras.layers import Concatenate, Dense, Dropout, Input, Reshape, Dot
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.regularizers import l2
from keras_radam import RAdam

K=25;

class EmbeddingLayer:
    """Set embedding layer class"""
    def __init__(self, n_items, n_factors):
        self.n_items = n_items
        self.n_factors = n_factors
    
    def __call__(self, x):
        x = Embedding(self.n_items, self.n_factors, embeddings_initializer='he_normal',
                      embeddings_regularizer=l2(1e-6))(x)
        x = Reshape((self.n_factors,))(x)
        return x


def create_cce(n_users, n_movies, K, opt = RAdam()):
    
    # Input layer
    user = Input(shape=(1,))
    movie = Input(shape=(1,))
    
    # Embedding layers
    u = EmbeddingLayer(n_users, K)(user)
    
    m = EmbeddingLayer(n_movies, K)(movie)

#     ici changer pour mettre un dot product !!
#     x = Concatenate()([u, m])
#     x = Dropout(0.3)(x)
    
    prod = Dot(name="Dot-Product", axes=1)([u, m])
    
    # Output layer
    x = Dense(5, activation='softmax', kernel_initializer='he_normal')(prod)
    
    # Hidden layers
#     x = Dense(n_neurons, activation='relu', kernel_initializer='he_normal')(x)
#     x = Dropout(dropout)(x)
    
    model = Model([user, movie], x)
    model.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy', 'categorical_accuracy'])
    
    return model

In [37]:
model = create_cce(n_users, n_movies, K)
model.summary()

Model: "model_9"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_17 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_18 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_17 (Embedding)        (None, 1, 25)        250000      input_17[0][0]                   
__________________________________________________________________________________________________
embedding_18 (Embedding)        (None, 1, 25)        25000       input_18[0][0]                   
____________________________________________________________________________________________

In [None]:
history = model.fit(X_train_array, y_train, batch_size=1000, epochs=20, verbose=1)
#     plt.plot(history.history['val_loss'])
#     plt.xlabel("Epochs")
#     plt.ylabel("Test Error")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20

In [35]:
model.evaluate(X_test_array, y_test)



[1.3945127342497934, 0.3704459071159363, 0.3704459071159363]

In [22]:
user_enc_samples = LabelEncoder()
samples['user'] = user_enc.fit_transform(samples['user_id'].values)

item_enc = LabelEncoder()
samples['movie'] = item_enc.fit_transform(samples['movie_id'].values)

samples['rating'] = samples['rating'].values.astype(np.int)
    
X_samples = samples[['user', 'movie']].values

X_samples_array = [X_samples[:,0], X_samples[:,1]]

samples_pred = model.predict(X_samples_array)

In [23]:
rating_samples = (np.argmax(samples_pred,1)+1).tolist()

In [24]:
submission = load_data(DATA_TEST_PATH)
submission['rating'] = rating_samples

In [25]:
from helpers import create_csv

DATA_SUBMISSION = "data/submission_dotprodCCE.csv"
create_csv(DATA_SUBMISSION, submission)

Results AICrowd : RMSE = ... ; Secondary = ...