In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import warnings


# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by numpy / pandas /
# keras, which makes version-related breakage harder to spot — confirm intended.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline


# Load the autoreload extension in mode 2 so that edits to local modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Neural Network with RMSE loss function

In [2]:
from helpers import load_data

# Input files: the training ratings and the sample-submission file.
DATA_TRAIN_PATH = "data/data_train.csv"
DATA_TEST_PATH = "data/sampleSubmission.csv"

# Read both files with the project loader (training set first).
data = load_data(DATA_TRAIN_PATH)
samples = load_data(DATA_TEST_PATH)

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# How many of the most active users / most rated movies to show in the preview.
TOP_N = 15

# The TOP_N users who gave the most ratings (index: user_id, value: rating count).
ratings_per_user = data.groupby('user_id')['rating'].count()
top_users = ratings_per_user.sort_values(ascending=False)[:TOP_N]

# The TOP_N movies that received the most ratings (index: movie_id, value: rating count).
ratings_per_movie = data.groupby('movie_id')['rating'].count()
top_movies = ratings_per_movie.sort_values(ascending=False)[:TOP_N]

# Keep only the ratings given by a top user to a top movie. A boolean mask
# selects the same rows as the former pair of inner joins, without appending
# the count columns (which ended up as two columns both named 'rating_r').
is_top_user = data['user_id'].isin(top_users.index)
is_top_movie = data['movie_id'].isin(top_movies.index)
top_r = data[is_top_user & is_top_movie]

# User x movie table of ratings; pairs without a rating show up as NaN.
# The string 'sum' replaces the np.sum callable, which recent pandas versions
# warn about when it is passed as an aggregation function.
pd.crosstab(top_r.user_id, top_r.movie_id, top_r.rating, aggfunc='sum')

movie_id,134,14,156,178,256,46,471,495,594,596,6,60,608,668,978
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1000,5.0,5.0,,5.0,,5.0,5.0,,5.0,5.0,,,,5.0,3.0
1570,5.0,,,,,,,5.0,,5.0,5.0,5.0,,,5.0
1830,,,,,5.0,5.0,,5.0,,5.0,5.0,5.0,5.0,,
1878,5.0,5.0,,5.0,5.0,5.0,5.0,,,,,,,5.0,
2038,5.0,5.0,,,,,,,5.0,,5.0,5.0,,5.0,
4600,5.0,,5.0,5.0,5.0,,5.0,5.0,,5.0,,,,,5.0
5289,5.0,,5.0,5.0,5.0,5.0,5.0,,5.0,,5.0,5.0,5.0,5.0,
5512,,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,,,5.0,,5.0
7014,5.0,5.0,5.0,,,5.0,,5.0,,,,5.0,,,
8575,5.0,,5.0,,,5.0,5.0,5.0,5.0,,5.0,5.0,5.0,,5.0


In [4]:
# Map raw user ids to contiguous integer indices. The encoder is fitted on the
# training data only; the sample-submission rows are then encoded with that
# SAME mapping via transform(). Calling fit_transform() on `samples` would
# refit the encoder and could assign different indices to the same user id,
# so predictions would be looked up for the wrong user. transform() raises
# ValueError on ids never seen in training instead of silently mis-encoding.
user_enc = LabelEncoder()
data['user'] = user_enc.fit_transform(data['user_id'].values)
samples['user'] = user_enc.transform(samples['user_id'].values)
n_users = data['user'].nunique()

# Same treatment for movie ids.
item_enc = LabelEncoder()
data['movie'] = item_enc.fit_transform(data['movie_id'].values)
samples['movie'] = item_enc.transform(samples['movie_id'].values)
n_movies = data['movie'].nunique()


# Ratings as plain integers. The builtin `int` replaces `np.int`, an alias that
# was deprecated in NumPy 1.20 and removed in NumPy 1.24.
data['rating'] = data['rating'].values.astype(int)
# Passed to optimizerLayerNeuron further down; presumably the embedding size
# per user / movie — confirm against that helper.
n_factors = 50

In [5]:
from keras.utils import np_utils

# Feature matrix (encoded user index, encoded movie index) and rating targets.
X = data[['user', 'movie']].values
y = data['rating']

# Hold out 10% of the ratings; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

# Split each feature matrix into its two columns: [user indices, movie indices].
X_train_array = [X_train[:, col] for col in (0, 1)]
X_test_array = [X_test[:, col] for col in (0, 1)]

# Turn the rating values into class indices (encoder fitted on the training
# targets only), then one-hot encode those indices.
encoder = LabelEncoder().fit(y_train)
encoder_train, encoder_test = (
    encoder.transform(targets) for targets in (y_train, y_test)
)

y_train, y_test = map(np_utils.to_categorical, (encoder_train, encoder_test))

Using TensorFlow backend.


In [7]:
from keras.models import Model,load_model
from keras.layers import Input, Reshape, Dot
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.regularizers import l2
from keras.layers import Concatenate, Dense, Dropout, Add, Activation, Lambda
from keras.callbacks import EarlyStopping
# Early-stopping callback built with the library defaults; passed to model.fit below.
# NOTE(review): with no arguments this presumably monitors the validation loss
# with zero patience (stops at the first epoch without improvement) — confirm
# against the installed Keras version.
usualCallback = EarlyStopping()


In [14]:
from OptimizerLayerNeurons import optimizerLayerNeuron

# Upper bounds handed to optimizerLayerNeuron.
nb_max_layers = 10
nb_max_neurons = 20

# Single place for the on-disk cache of the trained model.
MODEL_CACHE_PATH = 'model_optimizerLayerNeuron.h5'

if os.path.exists(MODEL_CACHE_PATH):
    # A trained model is already on disk: load it and skip the model
    # construction entirely. Previously optimizerLayerNeuron ran first and its
    # result was discarded as soon as the cache was found.
    model = load_model(MODEL_CACHE_PATH)
else:
    # NOTE(review): the full X / y (including the rows held out as X_test /
    # y_test) are passed here. If optimizerLayerNeuron trains or selects on
    # them, the validation curve below is optimistic — confirm.
    model = optimizerLayerNeuron(nb_max_layers, nb_max_neurons, n_users, n_movies, n_factors, X, y)

    # Train with early stopping; `epochs` is only an upper bound.
    history_dummy = model.fit(
        x=X_train_array,
        y=y_train,
        batch_size=1024,
        epochs=10000,
        verbose=1,
        validation_data=(X_test_array, y_test),
        callbacks=[usualCallback],
    )
    model.save(MODEL_CACHE_PATH)

    # Loss on the held-out split after each epoch.
    plt.plot(history_dummy.history['val_loss'])
    plt.xlabel("Epochs")
    plt.ylabel("Test Error")


Train on 1059256 samples, validate on 117696 samples
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Train on 1059256 samples, validate on 117696 samples
Epoch 1/10000

KeyboardInterrupt: 

## Optimize Layers, Neurons, Dropouts & Models

In [26]:
# Encoded (user, movie) index pairs for the submission rows, split into the
# same two-column list layout used for X_train_array / X_test_array.
X_samples = samples[['user', 'movie']].values
X_samples_array = [X_samples[:, col] for col in (0, 1)]

In [27]:
# Run the model on every submission row. The next cell takes an argmax over
# axis 1, so each output row presumably holds one score per rating class — confirm.
sample_pred = model.predict(X_samples_array)

In [38]:
# Most likely class per row, mapped back to the original rating value with the
# encoder fitted on the training targets. This replaces `argmax + 1`, which is
# only correct when the rating classes are exactly 1..K with none missing.
predicted_class = np.argmax(sample_pred, axis=1)
rating_samples = encoder.inverse_transform(predicted_class).tolist()

# Reload the submission template so the written file does not carry the
# helper 'user' / 'movie' columns added earlier.
# NOTE: this rebinds `samples`; the cell building X_samples needs those helper
# columns, so re-run the encoding cell before re-running it.
DATA_TEST_PATH = "data/sampleSubmission.csv"
samples = load_data(DATA_TEST_PATH)

samples['rating'] = rating_samples

In [41]:
from helpers import create_csv
# Output file for the predictions, written with the project's create_csv helper
# from the `samples` frame (which carries the predicted 'rating' column).
PATH_SUBMISSION = "DummyNeuralNetworkPrediction.csv"
create_csv(PATH_SUBMISSION, samples)

In [43]:
# Scratch check: a 32 x 10 array of random integers in [0, 1000).
# NOTE(review): this rebinds `X`, the feature matrix built earlier and passed
# to optimizerLayerNeuron; re-running the model cell after this one would use
# the random array instead. No seed is set, so the values differ on every run.
X=np.random.randint(1000, size=(32, 10))

# Display the shape to confirm the dimensions.
X.shape

(32, 10)