In [1]:
# training a model with Cross-Validation
# we use the data preperation functions from DLRM_train_test_split2 becaucse the run times for the models when # fiting where the fastest!

In [2]:
# reading the data ml-latest-small
import pandas as pd
import zipfile
import numpy as np 
zf = zipfile.ZipFile('/home/elena/Downloads/ml-latest-small.zip')
# reading ratings file:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(zf.open('ml-latest-small/ratings.csv'), names=r_cols)
m_cols=['movie_id', 'title', 'genre']
movies = pd.read_csv(zf.open('ml-latest-small/movies.csv'), names=m_cols)
# merging ratings and movies
ratings=pd.merge(ratings,movies,on='movie_id')
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,title,genre
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [3]:
def create_dataset(ratings, top=None):
    if top is not None:
        ratings.groupby('user_id')['rating'].count()
    
    unique_users = ratings.user_id.unique()
    older_users = ratings.user_id
    user_to_index = {old: new for new, old in enumerate(unique_users)}
    # index_to_user={new: old for old, new in user_to_index.items()}?
    new_users = ratings.user_id.map(user_to_index)
    
    unique_movies = ratings.movie_id.unique()
    movie_to_index = {old: new for new, old in enumerate(unique_movies)}
    # index_to_movie{new: old for old, new in movie_to_index.items()}
    new_movies = ratings.movie_id.map(movie_to_index)
    
    n_users = unique_users.shape[0]
    n_movies = unique_movies.shape[0]
    
    X = pd.DataFrame({'user_id': new_users,'user_id_old': ratings.user_id, 'movie_id': new_movies, 'title': ratings.title.values})
    y = ratings['rating'].astype(np.float32)
    return (n_users, n_movies), (X, y), (user_to_index, movie_to_index)

In [4]:
(n, m), (X, y), _ = create_dataset(ratings)

In [5]:
X.head()

Unnamed: 0,user_id,user_id_old,movie_id,title
0,0,1,0,Toy Story (1995)
1,1,5,0,Toy Story (1995)
2,2,7,0,Toy Story (1995)
3,3,15,0,Toy Story (1995)
4,4,17,0,Toy Story (1995)


In [6]:
print(f'Embeddings: {n} users, {m} movies')
print(f'Dataset shape: {X.shape}')
print(f'Target shape: {y.shape}')

Embeddings: 610 users, 9724 movies
Dataset shape: (100836, 4)
Target shape: (100836,)


In [7]:
# Implementation 
from keras.layers import Input, Embedding, Concatenate, Flatten, Dense, Dot, Add, Multiply, Subtract, Average
from keras.models import Model
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


In [8]:
def embedding_model(hidden_units, user_embedding_dim, user_max_cat_value, movie_embedding_dim, movie_max_cat_value,merging_method):
    # Each instance will consist of two inputs: a single user id, and a single movie id
    user_id_input = Input(shape=(1,), name='user_id')
    movie_id_input = Input(shape=(1,), name='movie_id')
    # Embeddings
    user_embedded = Embedding(user_max_cat_value+1, user_embedding_dim, 
                                       input_length=1, name='user_embedding')(user_id_input)
    movie_embedded = Embedding(movie_max_cat_value+1, movie_embedding_dim, 
                                        input_length=1, name='movie_embedding')(movie_id_input)
    # merging the embeddings
    if merging_method=='concatenate':
        merged = Concatenate()([user_embedded, movie_embedded])
    if merging_method=='dot_product':
        merged =Dot(name = 'dot_product', normalize = True, axes = 2)([user_embedded, movie_embedded])
    if merging_method=='add':
        merged =Add()([user_embedded, movie_embedded])
    if merging_method=='substract':
        merged=Subtract()([user_embedded, movie_embedded])
    if merging_method=='multiply':
        merged=Multiply()([user_embedded, movie_embedded])
    if merging_method=='average':
        merged=Average()([user_embedded, movie_embedded])
    out = Flatten()(merged)

    # Add one or more hidden layers
    for n_hidden in hidden_units:
        out = Dense(n_hidden, activation='relu')(out)

    # A single output: our predicted rating
    out = Dense(1, activation='linear', name='prediction')(out)
    return Model(inputs = [user_id_input, movie_id_input],outputs = out)

In [9]:
hidden_units = (100,50) #same as in pytorch model
movie_embedding_dim = 50 #same as in pytorch model
user_embedding_dim = 50  #same as in pytorch model
user_max_cat_value = n
movie_max_cat_value=m 
model=embedding_model(hidden_units, user_embedding_dim, user_max_cat_value, movie_embedding_dim, movie_max_cat_value, merging_method='concatenate')

In [10]:
# build the scikit-learn interface for the keras model
from keras.wrappers.scikit_learn import KerasRegressor
from tensorflow import random
seed = 1
np.random.seed(seed)
random.set_seed(seed)
# 
the_model=KerasRegressor(build_fn=model, epochs=10, batch_size=500, verbose=2)

In [11]:
# define the iterator to perform 5-foldd cross-validation
from sklearn.model_selection import KFold, cross_val_score
kf=KFold(n_splits=5)
results=cross_val_score(the_model,X.user_id, X.movie_id,y,cv=kf)

In [12]:
results

array([nan, nan, nan, nan, nan])

In [13]:
# does not work
# lets do it manually

In [14]:
kfold =KFold(n_splits=5, shuffle=True, random_state=seed)

In [15]:
model=embedding_model(hidden_units, user_embedding_dim, user_max_cat_value, movie_embedding_dim, movie_max_cat_value, merging_method='concatenate')
model.compile(optimizer = 'Adam',loss='MSE',metrics=['MAE'])
X_train=pd.DataFrame()
X_test=pd.DataFrame()
y_train=pd.DataFrame()
y_test=pd.DataFrame()
scores = []
for train_i, test_i in kf.split(X):
    X_train, X_test=X.iloc[train_i], X.iloc[test_i]
    y_train, y_test=y.iloc[train_i], y.iloc[test_i]
    model.fit(x=[X_train.user_id,X_train.movie_id], y=y_train, epochs=10, batch_size=500, verbose=0)
    result=model.evaluate(x=[X_test.user_id,X_test.movie_id],y=y_test,verbose=0)
    print("%s: %.2f " % (model.metrics_names[1], result[1]))
    scores.append(result[1])

MAE: 0.98 
MAE: 0.65 
MAE: 0.45 
MAE: 0.31 
MAE: 0.21 


In [16]:
# 1. run
# MAE: 0.97 
# MAE: 0.69 
# MAE: 0.51 
# MAE: 0.31 
# MAE: 0.20 

In [17]:
model=embedding_model(hidden_units, user_embedding_dim, user_max_cat_value, movie_embedding_dim, movie_max_cat_value, merging_method='concatenate')
model.compile(optimizer = 'Adam',loss='MSE',metrics=['MAE'])
yy=ratings['rating']
XX=ratings.drop('rating', axis=1)
trainX=pd.DataFrame()
testX=pd.DataFrame()
target_train=pd.DataFrame()
target_test=pd.DataFrame()
scores2 = []
for train_i, test_i in kf.split(X):
    trainX, testX=XX.iloc[train_i], XX.iloc[test_i]
    target_train, target_test=yy.iloc[train_i], yy.iloc[test_i]
    model.fit(x=[trainX.user_id,trainX.movie_id], y=target_train, epochs=10, batch_size=500, verbose=0)
    result=model.evaluate(x=[testX.user_id,testX.movie_id],y=target_test,verbose=0)
    print("%s: %.2f" % (model.metrics_names[1], result[1]))
    scores2.append(result[1])

InvalidArgumentError:  indices[6,0] = 48394 is not in [0, 9725)
	 [[node movie_embedding_7/embedding_lookup (defined at /home/elena/.local/share/virtualenvs/kaggle_new-dNsOYfOQ/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:3009) ]] [Op:__inference_keras_scratch_graph_71026]

Function call stack:
keras_scratch_graph
