In [1]:
# training a model with pre-defined (and saved) train / test datasets
# here we repeat the steps from DLRM_train_test_split but prepare the data for the model by (i) method # used in pytorch example and (ii) use Integer Encoding (from kaggle)

In [3]:
# reading the data ml-latest-small
import pandas as pd
import zipfile
import numpy as np 
zf = zipfile.ZipFile('/home/elena/Downloads/ml-latest-small.zip')
# reading ratings file:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(zf.open('ml-latest-small/ratings.csv'), names=r_cols)
m_cols=['movie_id', 'title', 'genre']
movies = pd.read_csv(zf.open('ml-latest-small/movies.csv'), names=m_cols)
# merging ratings and movies
ratings=pd.merge(ratings,movies,on='movie_id')
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,title,genre
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [4]:
def create_dataset(ratings):
    unique_movies = ratings.movie_id.unique() # returns a np array
    movie_to_index = {old: new for new, old in enumerate(unique_movies)} # indexing movie_id, tart at 0
    index_to_movie = {idx: movie for movie, idx in movie_to_index.items()}
    new_movies = ratings.movie_id.map(movie_to_index) # replaces movie_id with coresp. index
    ratings['movie_index']=new_movies

    train=pd.read_pickle('/home/elena/Downloads/traindata.pkl')
    test=pd.read_pickle('/home/elena/Downloads/testdata.pkl')
    train['movie_index']=train.movie_id.map(movie_to_index)
    test['movie_index']=test.movie_id.map(movie_to_index)

    X_train=train.drop('rating',axis=1)
    X_test=test.drop('rating', axis=1)

    y_train = train['rating'].astype(np.float32)
    y_test=test['rating'].astype(np.float32)
    return (X_train, y_train), (X_test,y_test)

In [5]:
(X_train, y_train), (X_test,y_test) = create_dataset(ratings)

In [7]:
X_train.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code,movie_id,unix_timestamp,title,genre,movie_index
97717,606,28,M,programmer,63044,3462,1171501099,Modern Times (1936),Comedy|Drama|Romance,1185
100124,610,22,M,student,21227,8914,1493845360,Primer (2004),Drama|Sci-Fi,2266
25952,180,22,F,administrator,60202,1196,1270237862,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi,68
25871,178,26,M,other,49512,2231,1163673637,Rounders (1998),Drama,2183
97255,605,33,M,engineer,33716,1588,1277094877,George of the Jungle (1997),Children|Comedy,1495


In [8]:
# Implementation 
from keras.layers import Input, Embedding, Concatenate, Flatten, Dense, Dot, Add, Multiply, Subtract, Average
from keras.models import Model
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


In [9]:
def embedding_model(hidden_units, user_embedding_dim, user_max_cat_value, movie_embedding_dim, movie_max_cat_value,merging_method):
    # Each instance will consist of two inputs: a single user id, and a single movie id
    user_id_input = Input(shape=(1,), name='user_id')
    movie_id_input = Input(shape=(1,), name='movie_id')
    # Embeddings
    user_embedded = Embedding(user_max_cat_value+1, user_embedding_dim, 
                                       input_length=1, name='user_embedding')(user_id_input)
    movie_embedded = Embedding(movie_max_cat_value+1, movie_embedding_dim, 
                                        input_length=1, name='movie_embedding')(movie_id_input)
    # merging the embeddings
    if merging_method=='concatenate':
        merged = Concatenate()([user_embedded, movie_embedded])
    if merging_method=='dot_product':
        merged =Dot(name = 'dot_product', normalize = True, axes = 2)([user_embedded, movie_embedded])
    if merging_method=='add':
        merged =Add()([user_embedded, movie_embedded])
    if merging_method=='substract':
        merged=Subtract()([user_embedded, movie_embedded])
    if merging_method=='multiply':
        merged=Multiply()([user_embedded, movie_embedded])
    if merging_method=='average':
        merged=Average()([user_embedded, movie_embedded])
    out = Flatten()(merged)

    # Add one or more hidden layers
    for n_hidden in hidden_units:
        out = Dense(n_hidden, activation='relu')(out)

    # A single output: our predicted rating
    out = Dense(1, activation='linear', name='prediction')(out)
    model = Model(inputs = [user_id_input, movie_id_input],outputs = out)
    model.compile(optimizer = 'Adam',loss='MSE',metrics=['MAE'])
    return model

In [14]:
# parameters for the models used
hidden_units = (100,50) #same as in pytorch model
movie_embedding_dim = 50 #same as in pytorch model
user_embedding_dim = 50  #same as in pytorch model
user_max_cat_value = ratings.user_id.max()
movie_max_cat_value=max(X_train.movie_index.max(), X_test.movie_index.max())
es=EarlyStopping(monitor='val_MAE', min_delta=0, patience=0, verbose=0, mode='min', baseline=None, restore_best_weights=False)

In [16]:
model_concatenate=embedding_model(hidden_units, user_embedding_dim, user_max_cat_value, movie_embedding_dim, movie_max_cat_value, merging_method='concatenate')
# model_concatenate.summary(line_length=88)
trained_model_concatenate= model_concatenate.fit(x=[X_train.user_id, X_train.movie_index], y=y_train, batch_size=500,epochs=10, verbose=2, validation_data=[[X_test.user_id, X_test.movie_index],y_test], callbacks=[es])

Train on 75627 samples, validate on 25209 samples
Epoch 1/10
 - 3s - loss: 3.2219 - MAE: 1.3146 - val_loss: 0.7871 - val_MAE: 0.6871
Epoch 2/10
 - 3s - loss: 0.7327 - MAE: 0.6584 - val_loss: 0.7689 - val_MAE: 0.6761
Epoch 3/10
 - 3s - loss: 0.6935 - MAE: 0.6386 - val_loss: 0.7683 - val_MAE: 0.6747
Epoch 4/10
 - 3s - loss: 0.6786 - MAE: 0.6297 - val_loss: 0.7726 - val_MAE: 0.6728
Epoch 5/10
 - 3s - loss: 0.6666 - MAE: 0.6232 - val_loss: 0.7697 - val_MAE: 0.6755


In [17]:
# compare to 
#  Epch 4/19 - 48s - loss: 0.6777 - MAE: 0.6295 - val_loss: 0.7709 - val_MAE: 0.6740

In [18]:
model_dot=embedding_model(hidden_units, user_embedding_dim, user_max_cat_value, movie_embedding_dim, movie_max_cat_value, merging_method='dot_product')
trained_model_dot= model_dot.fit(x=[X_train.user_id, X_train.movie_index], y=y_train, batch_size=500,epochs=10, verbose=2, validation_data=[[X_test.user_id, X_test.movie_index],y_test], callbacks=[es])

Train on 75627 samples, validate on 25209 samples
Epoch 1/10
 - 3s - loss: 4.3300 - MAE: 1.6309 - val_loss: 1.0839 - val_MAE: 0.8247
Epoch 2/10
 - 3s - loss: 1.0619 - MAE: 0.8205 - val_loss: 1.0836 - val_MAE: 0.8289


In [19]:
# compare to without data prep
# Epoch 2/10  - 57s - loss: 1.0672 - MAE: 0.8238 - val_loss: 1.0869 - val_MAE: 0.8376

In [20]:
model_add=embedding_model(hidden_units, user_embedding_dim, user_max_cat_value, movie_embedding_dim, movie_max_cat_value, merging_method='add')
trained_model_add= model_add.fit(x=[X_train.user_id, X_train.movie_index], y=y_train, batch_size=500,epochs=10, verbose=2, validation_data=[[X_test.user_id, X_test.movie_index],y_test], callbacks=[es])

Train on 75627 samples, validate on 25209 samples
Epoch 1/10
 - 3s - loss: 3.2192 - MAE: 1.3174 - val_loss: 0.7944 - val_MAE: 0.6886
Epoch 2/10
 - 3s - loss: 0.7339 - MAE: 0.6592 - val_loss: 0.7746 - val_MAE: 0.6791
Epoch 3/10
 - 3s - loss: 0.6922 - MAE: 0.6377 - val_loss: 0.7754 - val_MAE: 0.6776
Epoch 4/10
 - 3s - loss: 0.6730 - MAE: 0.6267 - val_loss: 0.7746 - val_MAE: 0.6755
Epoch 5/10
 - 3s - loss: 0.6584 - MAE: 0.6184 - val_loss: 0.7721 - val_MAE: 0.6755


In [21]:
# compare to without data prep
# Epoch 3/10 - 32s - loss: 0.6899 - MAE: 0.6363 - val_loss: 0.7743 - val_MAE: 0.6801

In [22]:
model_substract=embedding_model(hidden_units, user_embedding_dim, user_max_cat_value, movie_embedding_dim, movie_max_cat_value, merging_method='substract')
trained_model_substract= model_substract.fit(x=[X_train.user_id, X_train.movie_index], y=y_train, batch_size=500,epochs=10, verbose=2, validation_data=[[X_test.user_id, X_test.movie_index],y_test], callbacks=[es])

Train on 75627 samples, validate on 25209 samples
Epoch 1/10
 - 4s - loss: 3.1736 - MAE: 1.3069 - val_loss: 0.7937 - val_MAE: 0.6889
Epoch 2/10
 - 3s - loss: 0.7325 - MAE: 0.6587 - val_loss: 0.7741 - val_MAE: 0.6762
Epoch 3/10
 - 3s - loss: 0.6919 - MAE: 0.6372 - val_loss: 0.7730 - val_MAE: 0.6760
Epoch 4/10
 - 3s - loss: 0.6740 - MAE: 0.6280 - val_loss: 0.7713 - val_MAE: 0.6751
Epoch 5/10
 - 3s - loss: 0.6593 - MAE: 0.6190 - val_loss: 0.7709 - val_MAE: 0.6738
Epoch 6/10
 - 3s - loss: 0.6457 - MAE: 0.6108 - val_loss: 0.7733 - val_MAE: 0.6769


In [33]:
# compare to without data prep
# Epoch 7/10  - 46s - loss: 0.6186 - MAE: 0.5960 - val_loss: 0.7775 - val_MAE: 0.6773

In [23]:
model_multiply=embedding_model(hidden_units, user_embedding_dim, user_max_cat_value, movie_embedding_dim, movie_max_cat_value, merging_method='multiply')
history_multiply= model_multiply.fit(x=[X_train.user_id, X_train.movie_index], y=y_train, batch_size=500,epochs=10, verbose=2, validation_data=[[X_test.user_id, X_test.movie_index],y_test], callbacks=[es])

Train on 75627 samples, validate on 25209 samples
Epoch 1/10
 - 3s - loss: 4.4019 - MAE: 1.6439 - val_loss: 1.0738 - val_MAE: 0.8216
Epoch 2/10
 - 3s - loss: 0.9384 - MAE: 0.7617 - val_loss: 0.8787 - val_MAE: 0.7299
Epoch 3/10
 - 3s - loss: 0.4896 - MAE: 0.5309 - val_loss: 0.8846 - val_MAE: 0.7308


In [24]:
# compare to without data prep
# Epoch 3/10  - 46s - loss: 0.4674 - MAE: 0.5177 - val_loss: 0.8758 - val_MAE: 0.7270

In [25]:
# example prediction

In [26]:
# User 1
uid = 1
user_ratings = ratings[ratings.user_id==uid]
print("User #{} has rated {} movies (avg. rating = {:.1f}):".format(
    uid, len(user_ratings), user_ratings['rating'].mean(),
))
user_ratings.sort_values(by='rating', ascending=False)

User #1 has rated 232 movies (avg. rating = 4.4):


Unnamed: 0,user_id,movie_id,rating,unix_timestamp,title,genre,movie_index
16250,1,5060,5.0,964984002,M*A*S*H (a.k.a. MASH) (1970),Comedy|Drama|War,231
14053,1,2872,5.0,964981680,Excalibur (1981),Adventure|Fantasy,185
9066,1,1291,5.0,964981909,Indiana Jones and the Last Crusade (1989),Action|Adventure,89
9206,1,1298,5.0,964984086,Pink Floyd: The Wall (1982),Drama|Musical,90
14254,1,2948,5.0,964982191,From Russia with Love (1963),Action|Adventure|Thriller,190
...,...,...,...,...,...,...,...
12997,1,2617,2.0,964982588,"Mummy, The (1999)",Action|Adventure|Comedy|Fantasy|Horror|Thriller,170
11663,1,2253,2.0,964981775,Toys (1992),Comedy|Fantasy,143
11991,1,2338,2.0,964983546,I Still Know What You Did Last Summer (1998),Horror|Mystery|Thriller,148
12117,1,2389,2.0,964983094,Psycho (1998),Crime|Horror|Thriller,152


In [28]:
# predicting for movies from the test data
movies_from_test=X_test.movie_id[X_test.user_id==uid].unique()
# prediction 
pred_concatenate=model_concatenate.predict([[uid]*len(movies_from_test),movies_from_test])
pred_dot=model_dot.predict([[uid]*len(movies_from_test),movies_from_test])
pred_add=model_add.predict([[uid]*len(movies_from_test),movies_from_test])
pred_substract=model_substract.predict([[uid]*len(movies_from_test),movies_from_test])
pred_multiply=model_multiply.predict([[uid]*len(movies_from_test),movies_from_test])

In [31]:
user_data_test=X_test[X_test.user_id==uid]
user_data_test['model_concatenate']=pred_concatenate
user_data_test['model_dot']=pred_dot
user_data_test['model_add']=pred_add
user_data_test['model_substract']=pred_substract
user_data_test['model_multiply']=pred_multiply
user_data_test.sort_values(by='model_concatenate', ascending=False).head()

Unnamed: 0,user_id,age,sex,occupation,zip_code,movie_id,unix_timestamp,title,genre,movie_index,model_concatenate,model_dot,model_add,model_substract,model_multiply
156,1,24,M,technician,85711,2427,964982242,"Thin Red Line, The (1998)",Action|Drama|War,156,5.259588,3.453155,5.149864,5.198905,4.512282
219,1,24,M,technician,85711,3578,964980668,Gladiator (2000),Action|Adventure|Drama,219,5.051665,3.467472,5.08053,4.647391,3.503902
23,1,24,M,technician,85711,423,964982363,Blown Away (1994),Action|Thriller,23,5.015301,3.456332,5.042354,4.842852,4.371572
182,1,24,M,technician,85711,2797,964981710,Big (1988),Comedy|Drama|Fantasy|Romance,182,4.916557,3.531247,4.847631,4.948329,3.343365
186,1,24,M,technician,85711,2899,964982703,Gulliver's Travels (1939),Adventure|Animation|Children,186,4.896867,3.489475,4.976969,5.029078,4.093573


In [33]:
movie_name=lambda movie: ratings.title[ratings.movie_id==movie]
top5_model_concatenate_test=user_data_test.nlargest(5,'model_concatenate').movie_id.map(movie_name)
top5_model_dot_test=user_data_test.nlargest(5,'model_dot').movie_id.map(movie_name)
top5_model_add_test=user_data_test.nlargest(5,'model_add').movie_id.map(movie_name)
top5_model_substract_test=user_data_test.nlargest(5,'model_substract').movie_id.map(movie_name)
top5_model_multiply_test=user_data_test.nlargest(5,'model_multiply').movie_id.map(movie_name)

pd.DataFrame({'top5_model_concatenate': top5_model_concatenate_test.values, 'top5_model_dot':top5_model_dot_test.values, 'top5_model_add':  top5_model_add_test.values, 'top5_model_substract': top5_model_substract_test.values, 'top5_model_mulitply': top5_model_multiply_test.values})

Unnamed: 0,top5_model_concatenate,top5_model_dot,top5_model_add,top5_model_substract,top5_model_mulitply
0,"12247 Thin Red Line, The (1998) 12248 Th...",12009 Enemy of the State (1998) 12010 En...,"12247 Thin Red Line, The (1998) 12248 Th...","12247 Thin Red Line, The (1998) 12248 Th...",7130 Raiders of the Lost Ark (Indiana Jones...
1,15656 Gladiator (2000) 15657 Gladiator (...,"11104 Jungle Book, The (1967) 11105 Jung...",15656 Gladiator (2000) 15657 Gladiator (...,14078 Gulliver's Travels (1939) 14079 Gu...,8944 Young Frankenstein (1974) 8945 Youn...
2,2946 Blown Away (1994) 2947 Blown Away (...,5795 Dumbo (1941) 5796 Dumbo (1941) 5797...,"11104 Jungle Book, The (1967) 11105 Jung...",13732 Big (1988) 13733 Big (1988) 13734 ...,"12247 Thin Red Line, The (1998) 12248 Th..."
3,13732 Big (1988) 13733 Big (1988) 13734 ...,11293 Indiana Jones and the Temple of Doom ...,2946 Blown Away (1994) 2947 Blown Away (...,12394 20 Dates (1998) 12395 20 Dates (19...,2946 Blown Away (1994) 2947 Blown Away (...
4,14078 Gulliver's Travels (1939) 14079 Gu...,7130 Raiders of the Lost Ark (Indiana Jones...,14078 Gulliver's Travels (1939) 14079 Gu...,"11104 Jungle Book, The (1967) 11105 Jung...",12394 20 Dates (1998) 12395 20 Dates (19...


In [55]:
# the intersection between the above columns are greater then for DLRM_train_test_split

In [58]:
# lets now prepare the data using encoding;
# in kaggle is was stated: One embedding layer is required for each categorical variable, and the 
# embedding expects the categories to be ordinal encoded, although no relationship between the 
# categories is assumed
# previous code: columns=['user_id', 'movie_id', 'rating', 'title']
# train=train[columns], test=test[columns]

In [91]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder	

In [137]:
from collections import defaultdict
d = defaultdict(LabelEncoder)
# Encoding the variable
ratings_enc= ratings.apply(lambda x: d[x.name].fit_transform(x))
# Inverse the encoded if needed: ratings_enc.apply(lambda x: d[x.name].inverse_transform(x))
# Using the dictionary to label future data
train_enc=train.apply(lambda x: d[x.name].transform(x))
test_enc=test.apply(lambda x: d[x.name].transform(x))
X_train_enc, X_test_enc=train_enc.drop('rating',axis=1), test_enc.drop('rating',axis=1)
y_train_enc=train_enc.rating
y_test_enc=test_enc.rating

In [138]:
trained_model_concatenate_enc= model_concatenate.fit(x=[X_train_enc.user_id, X_train_enc.movie_id], y=y_train_enc, batch_size=500,epochs=10, verbose=2, validation_data=[[X_test_enc.user_id, X_test_enc.movie_id],y_test_enc], callbacks=[es])

Train on 75627 samples, validate on 25209 samples
Epoch 1/10
 - 4s - loss: 3.6856 - MAE: 1.4867 - val_loss: 3.0786 - val_MAE: 1.3585
Epoch 2/10
 - 4s - loss: 2.8512 - MAE: 1.2985 - val_loss: 3.0467 - val_MAE: 1.3404
Epoch 3/10
 - 4s - loss: 2.7113 - MAE: 1.2592 - val_loss: 3.0108 - val_MAE: 1.3299
Epoch 4/10
 - 4s - loss: 2.5990 - MAE: 1.2277 - val_loss: 2.9773 - val_MAE: 1.3238
Epoch 5/10
 - 4s - loss: 2.4960 - MAE: 1.1997 - val_loss: 2.9910 - val_MAE: 1.3138
Epoch 6/10
 - 4s - loss: 2.4163 - MAE: 1.1765 - val_loss: 2.9930 - val_MAE: 1.3247


In [139]:
trained_model_dot_enc= model_dot.fit(x=[X_train_enc.user_id, X_train_enc.movie_id], y=y_train_enc, batch_size=500,epochs=10, verbose=2, validation_data=[[X_test_enc.user_id, X_test_enc.movie_id],y_test_enc], callbacks=[es])

Train on 75627 samples, validate on 25209 samples
Epoch 1/10
 - 4s - loss: 4.6630 - MAE: 1.7233 - val_loss: 4.3283 - val_MAE: 1.6490
Epoch 2/10
 - 3s - loss: 4.1099 - MAE: 1.6057 - val_loss: 4.2320 - val_MAE: 1.6342
Epoch 3/10
 - 3s - loss: 2.8036 - MAE: 1.2483 - val_loss: 4.0573 - val_MAE: 1.5719
Epoch 4/10
 - 4s - loss: 1.8753 - MAE: 0.9828 - val_loss: 4.3442 - val_MAE: 1.6293


In [140]:
trained_model_add_enc= model_add.fit(x=[X_train_enc.user_id, X_train_enc.movie_id], y=y_train_enc, batch_size=500,epochs=10, verbose=2, validation_data=[[X_test_enc.user_id, X_test_enc.movie_id],y_test_enc], callbacks=[es])

Train on 75627 samples, validate on 25209 samples
Epoch 1/10
 - 4s - loss: 3.7912 - MAE: 1.5061 - val_loss: 3.0945 - val_MAE: 1.3574
Epoch 2/10
 - 3s - loss: 2.8353 - MAE: 1.2928 - val_loss: 3.0619 - val_MAE: 1.3439
Epoch 3/10
 - 3s - loss: 2.7199 - MAE: 1.2628 - val_loss: 3.0564 - val_MAE: 1.3349
Epoch 4/10
 - 3s - loss: 2.6495 - MAE: 1.2429 - val_loss: 3.0513 - val_MAE: 1.3389


In [141]:
trained_model_substract_enc= model_substract.fit(x=[X_train_enc.user_id, X_train_enc.movie_id], y=y_train_enc, batch_size=500,epochs=10, verbose=2, validation_data=[[X_test_enc.user_id, X_test_enc.movie_id],y_test_enc], callbacks=[es])

Train on 75627 samples, validate on 25209 samples
Epoch 1/10
 - 4s - loss: 3.7466 - MAE: 1.4967 - val_loss: 3.0887 - val_MAE: 1.3615
Epoch 2/10
 - 3s - loss: 2.8346 - MAE: 1.2934 - val_loss: 3.0448 - val_MAE: 1.3471
Epoch 3/10
 - 3s - loss: 2.7029 - MAE: 1.2570 - val_loss: 3.0510 - val_MAE: 1.3465
Epoch 4/10
 - 3s - loss: 2.6026 - MAE: 1.2292 - val_loss: 3.0385 - val_MAE: 1.3368
Epoch 5/10
 - 3s - loss: 2.4852 - MAE: 1.1970 - val_loss: 3.0578 - val_MAE: 1.3298
Epoch 6/10
 - 4s - loss: 2.3384 - MAE: 1.1562 - val_loss: 3.0739 - val_MAE: 1.3417


In [142]:
trained_model_multiply_enc= model_multiply.fit(x=[X_train_enc.user_id, X_train_enc.movie_id], y=y_train_enc, batch_size=500,epochs=10, verbose=2, validation_data=[[X_test_enc.user_id, X_test_enc.movie_id],y_test_enc], callbacks=[es])

Train on 75627 samples, validate on 25209 samples
Epoch 1/10
 - 3s - loss: 4.4342 - MAE: 1.6637 - val_loss: 3.6382 - val_MAE: 1.4878
Epoch 2/10
 - 3s - loss: 2.5127 - MAE: 1.2126 - val_loss: 3.5459 - val_MAE: 1.4560
Epoch 3/10
 - 3s - loss: 1.2740 - MAE: 0.8484 - val_loss: 3.6570 - val_MAE: 1.4843


In [143]:
# MUCH WORSE MAE!!!!!