In [1]:
# training a model with Cross-Validation
# we use the data preperation functions from DLRM_train_test_split2 becaucse the run times for the models when # fiting where the fastest!

In [2]:
# reading the data ml-latest-small
import pandas as pd
import zipfile
import numpy as np 
zf = zipfile.ZipFile('/home/elena/Downloads/ml-latest-small.zip')
# reading ratings file:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(zf.open('ml-latest-small/ratings.csv'), names=r_cols)
m_cols=['movie_id', 'title', 'genre']
movies = pd.read_csv(zf.open('ml-latest-small/movies.csv'), names=m_cols)
# merging ratings and movies
ratings=pd.merge(ratings,movies,on='movie_id')
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,title,genre
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [16]:
def create_dataset(ratings):
    unique_movies = ratings.movie_id.unique() # returns a np array
    movie_to_index = {old: new for new, old in enumerate(unique_movies)} # indexing movie_id, tart at 0
    index_to_movie = {idx: movie for movie, idx in movie_to_index.items()}
    new_movies = ratings.movie_id.map(movie_to_index) # replaces movie_id with coresp. index
    ratings['movie_index']=new_movies
    y=ratings['rating']
    X=ratings.drop('rating', axis=1)
    return (X, y)

In [17]:
(X,y) = create_dataset(ratings)

In [18]:
X.head()

Unnamed: 0,user_id,movie_id,unix_timestamp,title,genre,movie_index
0,1,1,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0
1,5,1,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0
2,7,1,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0
3,15,1,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0
4,17,1,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0


In [19]:
# Implementation 
from keras.layers import Input, Embedding, Concatenate, Flatten, Dense, Dot, Add, Multiply, Subtract, Average
from keras.models import Model
from keras.callbacks import EarlyStopping

In [20]:
def embedding_model(hidden_units, user_embedding_dim, user_max_cat_value, movie_embedding_dim, movie_max_cat_value,merging_method):
    # Each instance will consist of two inputs: a single user id, and a single movie id
    user_id_input = Input(shape=(1,), name='user_id')
    movie_id_input = Input(shape=(1,), name='movie_id')
    # Embeddings
    user_embedded = Embedding(user_max_cat_value+1, user_embedding_dim, 
                                       input_length=1, name='user_embedding')(user_id_input)
    movie_embedded = Embedding(movie_max_cat_value+1, movie_embedding_dim, 
                                        input_length=1, name='movie_embedding')(movie_id_input)
    # merging the embeddings
    if merging_method=='concatenate':
        merged = Concatenate()([user_embedded, movie_embedded])
    if merging_method=='dot_product':
        merged =Dot(name = 'dot_product', normalize = True, axes = 2)([user_embedded, movie_embedded])
    if merging_method=='add':
        merged =Add()([user_embedded, movie_embedded])
    if merging_method=='substract':
        merged=Subtract()([user_embedded, movie_embedded])
    if merging_method=='multiply':
        merged=Multiply()([user_embedded, movie_embedded])
    if merging_method=='average':
        merged=Average()([user_embedded, movie_embedded])
    out = Flatten()(merged)

    # Add one or more hidden layers
    for n_hidden in hidden_units:
        out = Dense(n_hidden, activation='relu')(out)

    # A single output: our predicted rating
    out = Dense(1, activation='linear', name='prediction')(out)
    model = Model(inputs = [user_id_input, movie_id_input],outputs = out)
    model.compile(optimizer = 'Adam',loss='MSE',metrics=['MAE'])
    return model

In [21]:
hidden_units = (100,50) #same as in pytorch model
movie_embedding_dim = 50 #same as in pytorch model
user_embedding_dim = 50  #same as in pytorch model
user_max_cat_value = X.user_id.unique().shape[0]
movie_max_cat_value=X.movie_index.unique().shape[0]
model=embedding_model(hidden_units, user_embedding_dim, user_max_cat_value, movie_embedding_dim, movie_max_cat_value, merging_method='concatenate')

In [22]:
# build the scikit-learn interface for the keras model
from keras.wrappers.scikit_learn import KerasRegressor
from tensorflow import random
seed = 1
np.random.seed(seed)
random.set_seed(seed)
# 
the_model=KerasRegressor(build_fn=model, epochs=10, batch_size=500, verbose=2)

In [23]:
# define the iterator to perform 5-foldd cross-validation
from sklearn.model_selection import KFold, cross_val_score
kf=KFold(n_splits=5)
results=cross_val_score(the_model,X.user_id, X.movie_index,y,cv=kf)

In [24]:
results

array([nan, nan, nan, nan, nan])

In [25]:
# does not work
# lets do it manually

In [26]:
kfold =KFold(n_splits=5, shuffle=True, random_state=seed)

In [28]:
model=embedding_model(hidden_units, user_embedding_dim, user_max_cat_value, movie_embedding_dim, movie_max_cat_value, merging_method='concatenate')
model.compile(optimizer = 'Adam',loss='MSE',metrics=['MAE'])

In [32]:
X_train=pd.DataFrame()
X_test=pd.DataFrame()
y_train=pd.DataFrame()
y_test=pd.DataFrame()
scores = []
for train_i, test_i in kf.split(X):
    X_train, X_test=X.iloc[train_i], X.iloc[test_i]
    y_train, y_test=y.iloc[train_i], y.iloc[test_i]
    model.fit(x=[X_train.user_id,X_train.movie_index], y=y_train, epochs=10, batch_size=500, verbose=0)
    result=model.evaluate(x=[X_test.user_id,X_test.movie_index],y=y_test,verbose=0)
    print("%s: %.2f " % (model.metrics_names[1], result[1]))
    scores.append(result[1])

MAE: 0.92 
MAE: 0.68 
MAE: 0.47 
MAE: 0.30 
MAE: 0.20 


In [37]:
# 1. run
# MAE: 0.97 
# MAE: 0.69 
# MAE: 0.51 
# MAE: 0.31 
# MAE: 0.20 

In [35]:
X_test.head()

Unnamed: 0,user_id,movie_id,unix_timestamp,title,genre,movie_index
80669,57,3773,965797677,House Party (1990),Comedy,3046
80670,294,3773,966596414,House Party (1990),Comedy,3046
80671,57,3774,965797855,House Party 2 (1991),Comedy|Drama|Romance,3047
80672,294,3774,966597088,House Party 2 (1991),Comedy|Drama|Romance,3047
80673,307,3774,1186258870,House Party 2 (1991),Comedy|Drama|Romance,3047


In [36]:
movies_test=X_test[X_test.user_id==1]

In [38]:
model.predict([1*len(movies_test),movies_test])

AttributeError: 'int' object has no attribute 'ndim'