In [1]:
# training a model with pre-defined (and saved) train / test datasets
# here we repeat the steps from DLRM_train_test_split but prepare the data for the model by (i) method # used in pytorch example and (ii) use Integer Encoding (from kaggle)

In [2]:
# reading the data ml-latest-small
import pandas as pd
import zipfile
import numpy as np 
zf = zipfile.ZipFile('/home/elena/Downloads/ml-latest-small.zip')
# reading ratings file:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(zf.open('ml-latest-small/ratings.csv'), names=r_cols)
m_cols=['movie_id', 'title', 'genre']
movies = pd.read_csv(zf.open('ml-latest-small/movies.csv'), names=m_cols)
# merging ratings and movies
ratings=pd.merge(ratings,movies,on='movie_id')
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,title,genre
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [12]:
def create_dataset(ratings, top=None):
    if top is not None:
        ratings.groupby('user_id')['rating'].count()
    
    unique_users = ratings.user_id.unique()
    older_users = ratings.user_id
    user_to_index = {old: new for new, old in enumerate(unique_users)}
    # enumerate(unique_users) returns index_of_user_id and user_id, so new= ndex_of_user_id, old=user_id
    new_users = ratings.user_id.map(user_to_index)
    
    unique_movies = ratings.movie_id.unique()
    older_movies = ratings.movie_id
    movie_to_index = {old: new for new, old in enumerate(unique_movies)}
    new_movies = ratings.movie_id.map(movie_to_index)
    
    n_users = unique_users.shape[0]
    n_movies = unique_movies.shape[0]
    
    X = pd.DataFrame({'user_id': new_users,'user_id_old': ratings.user_id, 'movie_id': new_movies, 'movie_id_old': ratings.movie_id, 'title': ratings.title.values})
    y = ratings['rating'].astype(np.float32)
    return (n_users, n_movies), (X, y), (user_to_index, movie_to_index)

In [13]:
(n, m), (X, y), _ = create_dataset(ratings)

In [15]:
# splitting into train and test data
# data was split and safed! 
train=pd.read_pickle('/home/elena/Downloads/traindata.pkl')
test=pd.read_pickle('/home/elena/Downloads/testdata.pkl')

In [178]:
train.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code,movie_id,rating,unix_timestamp,title,genre
97717,606,28,M,programmer,63044,3462,4.0,1171501099,Modern Times (1936),Comedy|Drama|Romance
100124,610,22,M,student,21227,8914,4.0,1493845360,Primer (2004),Drama|Sci-Fi
25952,180,22,F,administrator,60202,1196,4.0,1270237862,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi
25871,178,26,M,other,49512,2231,4.5,1163673637,Rounders (1998),Drama
97255,605,33,M,engineer,33716,1588,4.0,1277094877,George of the Jungle (1997),Children|Comedy


In [16]:
columns=['user_id', 'movie_id', 'rating', 'title']
train=train[columns]
test=test[columns]

In [17]:
(n_train, m_train), (X_train, y_train), _ = create_dataset(train)
(n_test, m_test), (X_test, y_test), _ = create_dataset(test)

In [18]:
X_train.head()

Unnamed: 0,user_id,user_id_old,movie_id,movie_id_old,title
97717,0,606,0,3462,Modern Times (1936)
100124,1,610,1,8914,Primer (2004)
25952,2,180,2,1196,Star Wars: Episode V - The Empire Strikes Back...
25871,3,178,3,2231,Rounders (1998)
97255,4,605,4,1588,George of the Jungle (1997)


In [19]:
print(f'Embeddings: {n_train} users, {m_train} movies')
print(f'Dataset shape: {X_train.shape}')
print(f'Target shape: {y_train.shape}')

Embeddings: 610 users, 8762 movies
Dataset shape: (75627, 5)
Target shape: (75627,)


In [20]:
print(f'Test Data: {n_test} users, {m_test} movies')
print(f'Dataset shape: {X_test.shape}')
print(f'Target shape: {y_test.shape}')

Test Data: 610 users, 5672 movies
Dataset shape: (25209, 5)
Target shape: (25209,)


In [21]:
# Implementation 
from keras.layers import Input, Embedding, Concatenate, Flatten, Dense, Dot, Add, Multiply, Subtract, Average
from keras.models import Model
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


In [22]:
def embedding_model(hidden_units, user_embedding_dim, user_max_cat_value, movie_embedding_dim, movie_max_cat_value,merging_method):
    # Each instance will consist of two inputs: a single user id, and a single movie id
    user_id_input = Input(shape=(1,), name='user_id')
    movie_id_input = Input(shape=(1,), name='movie_id')
    # Embeddings
    user_embedded = Embedding(user_max_cat_value+1, user_embedding_dim, 
                                       input_length=1, name='user_embedding')(user_id_input)
    movie_embedded = Embedding(movie_max_cat_value+1, movie_embedding_dim, 
                                        input_length=1, name='movie_embedding')(movie_id_input)
    # merging the embeddings
    if merging_method=='concatenate':
        merged = Concatenate()([user_embedded, movie_embedded])
    if merging_method=='dot_product':
        merged =Dot(name = 'dot_product', normalize = True, axes = 2)([user_embedded, movie_embedded])
    if merging_method=='add':
        merged =Add()([user_embedded, movie_embedded])
    if merging_method=='substract':
        merged=Subtract()([user_embedded, movie_embedded])
    if merging_method=='multiply':
        merged=Multiply()([user_embedded, movie_embedded])
    if merging_method=='average':
        merged=Average()([user_embedded, movie_embedded])
    out = Flatten()(merged)

    # Add one or more hidden layers
    for n_hidden in hidden_units:
        out = Dense(n_hidden, activation='relu')(out)

    # A single output: our predicted rating
    out = Dense(1, activation='linear', name='prediction')(out)
    model = Model(inputs = [user_id_input, movie_id_input],outputs = out)
    model.compile(optimizer = 'Adam',loss='MSE',metrics=['MAE'])
    return model

In [26]:
# parameters for the models used
hidden_units = (100,50) #same as in pytorch model
movie_embedding_dim = 50 #same as in pytorch model
user_embedding_dim = 50  #same as in pytorch model
user_max_cat_value = n
movie_max_cat_value=m 
es=EarlyStopping(monitor='val_MAE', min_delta=0, patience=0, verbose=0, mode='min', baseline=None, restore_best_weights=False)

In [27]:
model_concatenate=embedding_model(hidden_units, user_embedding_dim, user_max_cat_value, movie_embedding_dim, movie_max_cat_value, merging_method='concatenate')
# model_concatenate.summary(line_length=88)
trained_model_concatenate= model_concatenate.fit(x=[X_train.user_id, X_train.movie_id], y=y_train, batch_size=500,epochs=10, verbose=2, validation_data=[[X_test.user_id, X_test.movie_id],y_test], callbacks=[es])

Train on 75627 samples, validate on 25209 samples
Epoch 1/10
 - 6s - loss: 2.9159 - MAE: 1.2425 - val_loss: 1.3725 - val_MAE: 0.9333
Epoch 2/10
 - 5s - loss: 0.7310 - MAE: 0.6581 - val_loss: 1.4237 - val_MAE: 0.9466


In [188]:
# 1. run
# Epoch 2/10  - 3s - loss: 0.7313 - MAE: 0.6582 - val_loss: 1.4151 - val_MAE: 0.9443
# compare to 
#  Epch 4/19 - 48s - loss: 0.6777 - MAE: 0.6295 - val_loss: 0.7709 - val_MAE: 0.6740

In [28]:
model_dot=embedding_model(hidden_units, user_embedding_dim, user_max_cat_value, movie_embedding_dim, movie_max_cat_value, merging_method='dot_product')
trained_model_dot= model_dot.fit(x=[X_train.user_id, X_train.movie_id], y=y_train, batch_size=500,epochs=10, verbose=2, validation_data=[[X_test.user_id, X_test.movie_id],y_test], callbacks=[es])

Train on 75627 samples, validate on 25209 samples
Epoch 1/10
 - 5s - loss: 4.5304 - MAE: 1.6750 - val_loss: 1.0834 - val_MAE: 0.8260
Epoch 2/10
 - 5s - loss: 1.0752 - MAE: 0.8265 - val_loss: 1.0894 - val_MAE: 0.8339


In [30]:
# 2. run
# Epoch 2/10  - 3s - loss: 1.0364 - MAE: 0.8117 - val_loss: 1.1419 - val_MAE: 0.8593
# compare to without data prep
# Epoch 2/10  - 57s - loss: 1.0672 - MAE: 0.8238 - val_loss: 1.0869 - val_MAE: 0.8376

In [31]:
model_add=embedding_model(hidden_units, user_embedding_dim, user_max_cat_value, movie_embedding_dim, movie_max_cat_value, merging_method='add')
trained_model_add= model_add.fit(x=[X_train.user_id, X_train.movie_id], y=y_train, batch_size=500,epochs=10, verbose=2, validation_data=[[X_test.user_id, X_test.movie_id],y_test], callbacks=[es])

Train on 75627 samples, validate on 25209 samples
Epoch 1/10
 - 6s - loss: 3.1761 - MAE: 1.3064 - val_loss: 1.4247 - val_MAE: 0.9541
Epoch 2/10
 - 5s - loss: 0.7321 - MAE: 0.6580 - val_loss: 1.4546 - val_MAE: 0.9602


In [193]:
# 1.Run 
# Epoch 2/10  - 3s - loss: 0.7324 - MAE: 0.6590 - val_loss: 1.4445 - val_MAE: 0.9573
# compare to without data prep
# Epoch 3/10 - 32s - loss: 0.6899 - MAE: 0.6363 - val_loss: 0.7743 - val_MAE: 0.6801

In [32]:
model_substract=embedding_model(hidden_units, user_embedding_dim, user_max_cat_value, movie_embedding_dim, movie_max_cat_value, merging_method='substract')
trained_model_substract= model_substract.fit(x=[X_train.user_id, X_train.movie_id], y=y_train, batch_size=500,epochs=10, verbose=2, validation_data=[[X_test.user_id, X_test.movie_id],y_test], callbacks=[es])

Train on 75627 samples, validate on 25209 samples
Epoch 1/10
 - 5s - loss: 3.0893 - MAE: 1.2854 - val_loss: 1.4392 - val_MAE: 0.9617
Epoch 2/10
 - 6s - loss: 0.7312 - MAE: 0.6580 - val_loss: 1.4453 - val_MAE: 0.9568
Epoch 3/10
 - 6s - loss: 0.6913 - MAE: 0.6372 - val_loss: 1.4925 - val_MAE: 0.9708


In [33]:
# 1. run
# Epoch 2/10  - 3s - loss: 0.7352 - MAE: 0.6599 - val_loss: 1.4790 - val_MAE: 0.9682
# compare to without data prep
# Epoch 7/10  - 46s - loss: 0.6186 - MAE: 0.5960 - val_loss: 0.7775 - val_MAE: 0.6773

In [34]:
model_multiply=embedding_model(hidden_units, user_embedding_dim, user_max_cat_value, movie_embedding_dim, movie_max_cat_value, merging_method='multiply')
history_multiply= model_multiply.fit(x=[X_train.user_id, X_train.movie_id], y=y_train, batch_size=500,epochs=10, verbose=2, validation_data=[[X_test.user_id, X_test.movie_id],y_test], callbacks=[es])

Train on 75627 samples, validate on 25209 samples
Epoch 1/10
 - 6s - loss: 4.5613 - MAE: 1.6909 - val_loss: 1.0838 - val_MAE: 0.8307
Epoch 2/10
 - 6s - loss: 0.9243 - MAE: 0.7547 - val_loss: 1.2060 - val_MAE: 0.8850


In [197]:
# 1. run
# Epoch 2/10  - 3s - loss: 0.9186 - MAE: 0.7515 - val_loss: 1.2040 - val_MAE: 0.8827
# compare to without data prep
# Epoch 3/10  - 46s - loss: 0.4674 - MAE: 0.5177 - val_loss: 0.8758 - val_MAE: 0.7270

In [47]:
# example prediction

In [48]:
# User 1
uid = 1
user_ratings = ratings[ratings.user_id==uid]
print("User #{} has rated {} movies (avg. rating = {:.1f}):".format(
    uid, len(user_ratings), user_ratings['rating'].mean(),
))
user_ratings.sort_values(by='rating', ascending=False)

User #1 has rated 232 movies (avg. rating = 4.4):


Unnamed: 0,user_id,movie_id,rating,unix_timestamp,title,genre
16250,1,5060,5.0,964984002,M*A*S*H (a.k.a. MASH) (1970),Comedy|Drama|War
14053,1,2872,5.0,964981680,Excalibur (1981),Adventure|Fantasy
9066,1,1291,5.0,964981909,Indiana Jones and the Last Crusade (1989),Action|Adventure
9206,1,1298,5.0,964984086,Pink Floyd: The Wall (1982),Drama|Musical
14254,1,2948,5.0,964982191,From Russia with Love (1963),Action|Adventure|Thriller
...,...,...,...,...,...,...
12997,1,2617,2.0,964982588,"Mummy, The (1999)",Action|Adventure|Comedy|Fantasy|Horror|Thriller
11663,1,2253,2.0,964981775,Toys (1992),Comedy|Fantasy
11991,1,2338,2.0,964983546,I Still Know What You Did Last Summer (1998),Horror|Mystery|Thriller
12117,1,2389,2.0,964983094,Psycho (1998),Crime|Horror|Thriller


In [49]:
# predicting for movies from the test data
movies_from_test=X_test.movie_id[X_test.user_id_old==uid].unique()
# prediction 
pred_concatenate=model_concatenate.predict([[uid]*len(movies_from_test),movies_from_test])
pred_dot=model_dot.predict([[uid]*len(movies_from_test),movies_from_test])
pred_add=model_add.predict([[uid]*len(movies_from_test),movies_from_test])
pred_substract=model_substract.predict([[uid]*len(movies_from_test),movies_from_test])
pred_multiply=model_multiply.predict([[uid]*len(movies_from_test),movies_from_test])

In [53]:
user_data_test=test[test.user_id==uid]
user_data_test['model_concatenate']=pred_concatenate
user_data_test['model_dot']=pred_dot
user_data_test['model_add']=pred_add
user_data_test['model_substract']=pred_substract
user_data_test['model_multiply']=pred_multiply
user_data_test.sort_values(by='rating', ascending=False).head()

Unnamed: 0,user_id,movie_id,rating,title,model_concatenate,model_dot,model_add,model_substract,model_multiply
115,1,1954,5.0,Rocky (1976),3.951483,3.421226,3.860736,3.863217,3.769209
156,1,2427,5.0,"Thin Red Line, The (1998)",4.18834,3.42234,3.943762,3.956832,3.73762
8,1,151,5.0,Rob Roy (1995),4.110387,3.460751,4.457175,4.356219,3.617298
88,1,1282,5.0,Fantasia (1940),3.653706,3.476704,3.590794,3.459525,3.499617
147,1,2329,5.0,American History X (1998),4.042581,3.421099,3.649195,4.041459,3.816945


In [54]:
movie_name=lambda movie: X.title[X.movie_id==movie]
top5_model_concatenate_test=user_data_test.nlargest(5,'model_concatenate').movie_id.map(movie_name)
top5_model_dot_test=user_data_test.nlargest(5,'model_dot').movie_id.map(movie_name)
top5_model_add_test=user_data_test.nlargest(5,'model_add').movie_id.map(movie_name)
top5_model_substract_test=user_data_test.nlargest(5,'model_substract').movie_id.map(movie_name)
top5_model_multiply_test=user_data_test.nlargest(5,'model_multiply').movie_id.map(movie_name)

pd.DataFrame({'top5_model_concatenate': top5_model_concatenate_test.values, 'top5_model_dot':top5_model_dot_test.values, 'top5_model_add':  top5_model_add_test.values, 'top5_model_substract': top5_model_substract_test.values, 'top5_model_mulitply': top5_model_multiply_test.values})

Unnamed: 0,top5_model_concatenate,top5_model_dot,top5_model_add,top5_model_substract,top5_model_mulitply
0,79094 Harold and Maude (1971) 79095 Haro...,79094 Harold and Maude (1971) 79095 Haro...,70504 Dark Days (2000) 70505 Dark Days (...,79094 Harold and Maude (1971) 79095 Haro...,"85367 Story of Women (Affaire de femmes, Un..."
1,"85367 Story of Women (Affaire de femmes, Un...",70503 Stray Dog (Nora inu) (1949) Name: tit...,"85367 Story of Women (Affaire de femmes, Un...","85367 Story of Women (Affaire de femmes, Un...",59893 The Devil's Advocate (1997) 59894 ...
2,70504 Dark Days (2000) 70505 Dark Days (...,"53224 Bourne Identity, The (2002) 53225 ...","53224 Bourne Identity, The (2002) 53225 ...","53224 Bourne Identity, The (2002) 53225 ...","53224 Bourne Identity, The (2002) 53225 ..."
3,59893 The Devil's Advocate (1997) 59894 ...,"45702 Fifth Element, The (1997) 45703 Fi...",79094 Harold and Maude (1971) 79095 Haro...,70504 Dark Days (2000) 70505 Dark Days (...,70504 Dark Days (2000) 70505 Dark Days (...
4,"53224 Bourne Identity, The (2002) 53225 ...",59893 The Devil's Advocate (1997) 59894 ...,12101 Very Bad Things (1998) 12102 Very ...,12101 Very Bad Things (1998) 12102 Very ...,"32998 Aristocats, The (1970) 32999 Arist..."


In [55]:
# the intersection between the above columns are greater then for DLRM_train_test_split

In [58]:
# lets now prepare the data using encoding;
# in kaggle is was stated: One embedding layer is required for each categorical variable, and the 
# embedding expects the categories to be ordinal encoded, although no relationship between the 
# categories is assumed
# previous code: columns=['user_id', 'movie_id', 'rating', 'title']
# train=train[columns], test=test[columns]

In [91]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder	

In [137]:
from collections import defaultdict
d = defaultdict(LabelEncoder)
# Encoding the variable
ratings_enc= ratings.apply(lambda x: d[x.name].fit_transform(x))
# Inverse the encoded if needed: ratings_enc.apply(lambda x: d[x.name].inverse_transform(x))
# Using the dictionary to label future data
train_enc=train.apply(lambda x: d[x.name].transform(x))
test_enc=test.apply(lambda x: d[x.name].transform(x))
X_train_enc, X_test_enc=train_enc.drop('rating',axis=1), test_enc.drop('rating',axis=1)
y_train_enc=train_enc.rating
y_test_enc=test_enc.rating

In [138]:
trained_model_concatenate_enc= model_concatenate.fit(x=[X_train_enc.user_id, X_train_enc.movie_id], y=y_train_enc, batch_size=500,epochs=10, verbose=2, validation_data=[[X_test_enc.user_id, X_test_enc.movie_id],y_test_enc], callbacks=[es])

Train on 75627 samples, validate on 25209 samples
Epoch 1/10
 - 4s - loss: 3.6856 - MAE: 1.4867 - val_loss: 3.0786 - val_MAE: 1.3585
Epoch 2/10
 - 4s - loss: 2.8512 - MAE: 1.2985 - val_loss: 3.0467 - val_MAE: 1.3404
Epoch 3/10
 - 4s - loss: 2.7113 - MAE: 1.2592 - val_loss: 3.0108 - val_MAE: 1.3299
Epoch 4/10
 - 4s - loss: 2.5990 - MAE: 1.2277 - val_loss: 2.9773 - val_MAE: 1.3238
Epoch 5/10
 - 4s - loss: 2.4960 - MAE: 1.1997 - val_loss: 2.9910 - val_MAE: 1.3138
Epoch 6/10
 - 4s - loss: 2.4163 - MAE: 1.1765 - val_loss: 2.9930 - val_MAE: 1.3247


In [139]:
trained_model_dot_enc= model_dot.fit(x=[X_train_enc.user_id, X_train_enc.movie_id], y=y_train_enc, batch_size=500,epochs=10, verbose=2, validation_data=[[X_test_enc.user_id, X_test_enc.movie_id],y_test_enc], callbacks=[es])

Train on 75627 samples, validate on 25209 samples
Epoch 1/10
 - 4s - loss: 4.6630 - MAE: 1.7233 - val_loss: 4.3283 - val_MAE: 1.6490
Epoch 2/10
 - 3s - loss: 4.1099 - MAE: 1.6057 - val_loss: 4.2320 - val_MAE: 1.6342
Epoch 3/10
 - 3s - loss: 2.8036 - MAE: 1.2483 - val_loss: 4.0573 - val_MAE: 1.5719
Epoch 4/10
 - 4s - loss: 1.8753 - MAE: 0.9828 - val_loss: 4.3442 - val_MAE: 1.6293


In [140]:
trained_model_add_enc= model_add.fit(x=[X_train_enc.user_id, X_train_enc.movie_id], y=y_train_enc, batch_size=500,epochs=10, verbose=2, validation_data=[[X_test_enc.user_id, X_test_enc.movie_id],y_test_enc], callbacks=[es])

Train on 75627 samples, validate on 25209 samples
Epoch 1/10
 - 4s - loss: 3.7912 - MAE: 1.5061 - val_loss: 3.0945 - val_MAE: 1.3574
Epoch 2/10
 - 3s - loss: 2.8353 - MAE: 1.2928 - val_loss: 3.0619 - val_MAE: 1.3439
Epoch 3/10
 - 3s - loss: 2.7199 - MAE: 1.2628 - val_loss: 3.0564 - val_MAE: 1.3349
Epoch 4/10
 - 3s - loss: 2.6495 - MAE: 1.2429 - val_loss: 3.0513 - val_MAE: 1.3389


In [141]:
trained_model_substract_enc= model_substract.fit(x=[X_train_enc.user_id, X_train_enc.movie_id], y=y_train_enc, batch_size=500,epochs=10, verbose=2, validation_data=[[X_test_enc.user_id, X_test_enc.movie_id],y_test_enc], callbacks=[es])

Train on 75627 samples, validate on 25209 samples
Epoch 1/10
 - 4s - loss: 3.7466 - MAE: 1.4967 - val_loss: 3.0887 - val_MAE: 1.3615
Epoch 2/10
 - 3s - loss: 2.8346 - MAE: 1.2934 - val_loss: 3.0448 - val_MAE: 1.3471
Epoch 3/10
 - 3s - loss: 2.7029 - MAE: 1.2570 - val_loss: 3.0510 - val_MAE: 1.3465
Epoch 4/10
 - 3s - loss: 2.6026 - MAE: 1.2292 - val_loss: 3.0385 - val_MAE: 1.3368
Epoch 5/10
 - 3s - loss: 2.4852 - MAE: 1.1970 - val_loss: 3.0578 - val_MAE: 1.3298
Epoch 6/10
 - 4s - loss: 2.3384 - MAE: 1.1562 - val_loss: 3.0739 - val_MAE: 1.3417


In [142]:
trained_model_multiply_enc= model_multiply.fit(x=[X_train_enc.user_id, X_train_enc.movie_id], y=y_train_enc, batch_size=500,epochs=10, verbose=2, validation_data=[[X_test_enc.user_id, X_test_enc.movie_id],y_test_enc], callbacks=[es])

Train on 75627 samples, validate on 25209 samples
Epoch 1/10
 - 3s - loss: 4.4342 - MAE: 1.6637 - val_loss: 3.6382 - val_MAE: 1.4878
Epoch 2/10
 - 3s - loss: 2.5127 - MAE: 1.2126 - val_loss: 3.5459 - val_MAE: 1.4560
Epoch 3/10
 - 3s - loss: 1.2740 - MAE: 0.8484 - val_loss: 3.6570 - val_MAE: 1.4843


In [143]:
# MUCH WORSE MAE!!!!!