In [173]:
# training a model with pre-defined (and saved) train / test datasets
# here we repeat the steps from DLRM_train_test_split but prepare the data as was done in the pytorch model!

In [174]:
# reading the data ml-latest-small
import pandas as pd
import zipfile
import numpy as np 
zf = zipfile.ZipFile('/home/elena/Downloads/ml-latest-small.zip')
# reading ratings file:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(zf.open('ml-latest-small/ratings.csv'), names=r_cols)
m_cols=['movie_id', 'title', 'genre']
movies = pd.read_csv(zf.open('ml-latest-small/movies.csv'), names=m_cols)
# merging ratings and movies
ratings=pd.merge(ratings,movies,on='movie_id')
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,title,genre
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [175]:
def create_dataset(ratings, top=None):
    if top is not None:
        ratings.groupby('user_id')['rating'].count()
    
    unique_users = ratings.user_id.unique()
    older_users = ratings.user_id
    user_to_index = {old: new for new, old in enumerate(unique_users)}
    # index_to_user={new: old for old, new in user_to_index.items()}?
    new_users = ratings.user_id.map(user_to_index)
    
    unique_movies = ratings.movie_id.unique()
    movie_to_index = {old: new for new, old in enumerate(unique_movies)}
    # index_to_movie{new: old for old, new in movie_to_index.items()}
    new_movies = ratings.movie_id.map(movie_to_index)
    
    n_users = unique_users.shape[0]
    n_movies = unique_movies.shape[0]
    
    X = pd.DataFrame({'user_id': new_users,'user_id_old': ratings.user_id, 'movie_id': new_movies, 'title': ratings.title.values})
    y = ratings['rating'].astype(np.float32)
    return (n_users, n_movies), (X, y), (user_to_index, movie_to_index)

In [176]:
(n, m), (X, y), _ = create_dataset(ratings)

In [177]:
# splitting into train and test data
# data was split and safed! 
train=pd.read_pickle('/home/elena/Downloads/traindata.pkl')
test=pd.read_pickle('/home/elena/Downloads/testdata.pkl')

In [178]:
train.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code,movie_id,rating,unix_timestamp,title,genre
97717,606,28,M,programmer,63044,3462,4.0,1171501099,Modern Times (1936),Comedy|Drama|Romance
100124,610,22,M,student,21227,8914,4.0,1493845360,Primer (2004),Drama|Sci-Fi
25952,180,22,F,administrator,60202,1196,4.0,1270237862,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi
25871,178,26,M,other,49512,2231,4.5,1163673637,Rounders (1998),Drama
97255,605,33,M,engineer,33716,1588,4.0,1277094877,George of the Jungle (1997),Children|Comedy


In [179]:
columns=['user_id', 'movie_id', 'rating', 'title']
train=train[columns]
test=test[columns]

In [180]:
(n_train, m_train), (X_train, y_train), _ = create_dataset(train)
(n_test, m_test), (X_test, y_test), _ = create_dataset(test)

In [181]:
X_train.head()

Unnamed: 0,user_id,user_id_old,movie_id,title
97717,0,606,0,Modern Times (1936)
100124,1,610,1,Primer (2004)
25952,2,180,2,Star Wars: Episode V - The Empire Strikes Back...
25871,3,178,3,Rounders (1998)
97255,4,605,4,George of the Jungle (1997)


In [182]:
print(f'Embeddings: {n_train} users, {m_train} movies')
print(f'Dataset shape: {X_train.shape}')
print(f'Target shape: {y_train.shape}')

Embeddings: 610 users, 8762 movies
Dataset shape: (75627, 4)
Target shape: (75627,)


In [183]:
print(f'Test Data: {n_test} users, {m_test} movies')
print(f'Dataset shape: {X_test.shape}')
print(f'Target shape: {y_test.shape}')

Test Data: 610 users, 5672 movies
Dataset shape: (25209, 4)
Target shape: (25209,)


In [184]:
# Implementation 
from keras.layers import Input, Embedding, Concatenate, Flatten, Dense, Dot, Add, Multiply, Subtract, Average
from keras.models import Model
from keras.callbacks import EarlyStopping

In [185]:
def embedding_model(hidden_units, user_embedding_dim, user_max_cat_value, movie_embedding_dim, movie_max_cat_value,merging_method):
    # Each instance will consist of two inputs: a single user id, and a single movie id
    user_id_input = Input(shape=(1,), name='user_id')
    movie_id_input = Input(shape=(1,), name='movie_id')
    # Embeddings
    user_embedded = Embedding(user_max_cat_value+1, user_embedding_dim, 
                                       input_length=1, name='user_embedding')(user_id_input)
    movie_embedded = Embedding(movie_max_cat_value+1, movie_embedding_dim, 
                                        input_length=1, name='movie_embedding')(movie_id_input)
    # merging the embeddings
    if merging_method=='concatenate':
        merged = Concatenate()([user_embedded, movie_embedded])
    if merging_method=='dot_product':
        merged =Dot(name = 'dot_product', normalize = True, axes = 2)([user_embedded, movie_embedded])
    if merging_method=='add':
        merged =Add()([user_embedded, movie_embedded])
    if merging_method=='substract':
        merged=Subtract()([user_embedded, movie_embedded])
    if merging_method=='multiply':
        merged=Multiply()([user_embedded, movie_embedded])
    if merging_method=='average':
        merged=Average()([user_embedded, movie_embedded])
    out = Flatten()(merged)

    # Add one or more hidden layers
    for n_hidden in hidden_units:
        out = Dense(n_hidden, activation='relu')(out)

    # A single output: our predicted rating
    out = Dense(1, activation='linear', name='prediction')(out)
    return Model(inputs = [user_id_input, movie_id_input],outputs = out)

In [186]:
hidden_units = (100,50) #same as in pytorch model
movie_embedding_dim = 50 #same as in pytorch model
user_embedding_dim = 50  #same as in pytorch model
user_max_cat_value = n
movie_max_cat_value=m 
model_concatenate=embedding_model(hidden_units, user_embedding_dim, user_max_cat_value, movie_embedding_dim, movie_max_cat_value, merging_method='concatenate')
# model_concatenate.summary(line_length=88)

In [187]:
# compiling / new version
model_concatenate.compile(optimizer = 'Adam',loss='MSE',metrics=['MAE'])
# early stopping
es=EarlyStopping(monitor='val_MAE', min_delta=0, patience=0, verbose=0, mode='min', baseline=None, restore_best_weights=False)
# training and using the pre-defined train and test data
trained_model_concatenate= model_concatenate.fit(x=[X_train.user_id, X_train.movie_id], y=y_train, batch_size=500,epochs=10, verbose=2, validation_data=[[X_test.user_id, X_test.movie_id],y_test], callbacks=[es])

Train on 75627 samples, validate on 25209 samples
Epoch 1/10
 - 3s - loss: 3.0598 - MAE: 1.2792 - val_loss: 1.3565 - val_MAE: 0.9282
Epoch 2/10
 - 3s - loss: 0.7313 - MAE: 0.6582 - val_loss: 1.4151 - val_MAE: 0.9443


In [188]:
#  Epch 4/19 - 48s - loss: 0.6777 - MAE: 0.6295 - val_loss: 0.7709 - val_MAE: 0.6740

In [189]:
model_dot=embedding_model(hidden_units, user_embedding_dim, user_max_cat_value, movie_embedding_dim, movie_max_cat_value, merging_method='dot_product')

In [190]:
# compiling 
model_dot.compile(optimizer = 'Adam',loss='MSE',metrics=['MAE'])
# early stopping
es=EarlyStopping(monitor='val_MAE', min_delta=0, patience=0, verbose=0, mode='min', baseline=None, restore_best_weights=False)
# training and using the pre-defined train and test data
trained_model_dot= model_dot.fit(x=[X_train.user_id, X_train.movie_id], y=y_train, batch_size=500,epochs=10, verbose=2, validation_data=[[X_test.user_id, X_test.movie_id],y_test], callbacks=[es])

Train on 75627 samples, validate on 25209 samples
Epoch 1/10
 - 3s - loss: 4.4732 - MAE: 1.6629 - val_loss: 1.0828 - val_MAE: 0.8253
Epoch 2/10
 - 3s - loss: 1.0364 - MAE: 0.8117 - val_loss: 1.1419 - val_MAE: 0.8593


In [191]:
# Epoch 2/10  - 57s - loss: 1.0672 - MAE: 0.8238 - val_loss: 1.0869 - val_MAE: 0.8376
# around the same

In [192]:
model_add=embedding_model(hidden_units, user_embedding_dim, user_max_cat_value, movie_embedding_dim, movie_max_cat_value, merging_method='add')
# compiling 
model_add.compile(optimizer = 'Adam',loss='MSE',metrics=['MAE'])
# early stopping
es=EarlyStopping(monitor='val_MAE', min_delta=0, patience=0, verbose=0, mode='min', baseline=None, restore_best_weights=False)
# training and using the pre-defined train and test data
trained_model_add= model_add.fit(x=[X_train.user_id, X_train.movie_id], y=y_train, batch_size=500,epochs=10, verbose=2, validation_data=[[X_test.user_id, X_test.movie_id],y_test], callbacks=[es])

Train on 75627 samples, validate on 25209 samples
Epoch 1/10
 - 3s - loss: 3.2245 - MAE: 1.3192 - val_loss: 1.4134 - val_MAE: 0.9504
Epoch 2/10
 - 3s - loss: 0.7324 - MAE: 0.6590 - val_loss: 1.4445 - val_MAE: 0.9573


In [193]:
# Epoch 3/10 - 32s - loss: 0.6899 - MAE: 0.6363 - val_loss: 0.7743 - val_MAE: 0.6801

In [194]:
model_substract=embedding_model(hidden_units, user_embedding_dim, user_max_cat_value, movie_embedding_dim, movie_max_cat_value, merging_method='substract')
# compiling 
model_substract.compile(optimizer = 'Adam',loss='MSE',metrics=['MAE'])
# early stopping
es=EarlyStopping(monitor='val_MAE', min_delta=0, patience=0, verbose=0, mode='min', baseline=None, restore_best_weights=False)
# training and using the pre-defined train and test data
trained_model_substract= model_substract.fit(x=[X_train.user_id, X_train.movie_id], y=y_train, batch_size=500,epochs=10, verbose=2, validation_data=[[X_test.user_id, X_test.movie_id],y_test], callbacks=[es])

Train on 75627 samples, validate on 25209 samples
Epoch 1/10
 - 3s - loss: 3.2837 - MAE: 1.3305 - val_loss: 1.4433 - val_MAE: 0.9620
Epoch 2/10
 - 3s - loss: 0.7352 - MAE: 0.6599 - val_loss: 1.4790 - val_MAE: 0.9682


In [195]:
# Epoch 7/10  - 46s - loss: 0.6186 - MAE: 0.5960 - val_loss: 0.7775 - val_MAE: 0.6773

In [196]:
model_multiply=embedding_model(hidden_units, user_embedding_dim, user_max_cat_value, movie_embedding_dim, movie_max_cat_value, merging_method='multiply')
# compiling 
model_multiply.compile(optimizer = 'Adam',loss='MSE',metrics=['MAE'])
# early stopping
es=EarlyStopping(monitor='val_MAE', min_delta=0, patience=0, verbose=0, mode='min', baseline=None, restore_best_weights=False)
# training and using the pre-defined train and test data
history_multiply= model_multiply.fit(x=[X_train.user_id, X_train.movie_id], y=y_train, batch_size=500,epochs=10, verbose=2, validation_data=[[X_test.user_id, X_test.movie_id],y_test], callbacks=[es])

Train on 75627 samples, validate on 25209 samples
Epoch 1/10
 - 3s - loss: 4.1772 - MAE: 1.5922 - val_loss: 1.0819 - val_MAE: 0.8247
Epoch 2/10
 - 3s - loss: 0.9186 - MAE: 0.7515 - val_loss: 1.2040 - val_MAE: 0.8827


In [197]:
# Epoch 3/10  - 46s - loss: 0.4674 - MAE: 0.5177 - val_loss: 0.8758 - val_MAE: 0.7270

In [198]:
# example prediction

In [199]:
data=X
data['rating']=y

In [200]:
data.head()

Unnamed: 0,user_id,user_id_old,movie_id,title,rating
0,0,1,0,Toy Story (1995),4.0
1,1,5,0,Toy Story (1995),4.0
2,2,7,0,Toy Story (1995),4.5
3,3,15,0,Toy Story (1995),2.5
4,4,17,0,Toy Story (1995),4.5


In [201]:
# User 1
ratings_per_user = data.groupby('user_id').size()
uid = 1
user_ratings = data[data.user_id_old==uid]
print("User #{} has rated {} movies (avg. rating = {:.1f}):".format(
    uid, len(user_ratings), user_ratings['rating'].mean(),
))
user_ratings.sort_values(by='rating', ascending=False)

User #1 has rated 232 movies (avg. rating = 4.4):


Unnamed: 0,user_id,user_id_old,movie_id,title,rating
16250,0,1,231,M*A*S*H (a.k.a. MASH) (1970),5.0
14053,0,1,185,Excalibur (1981),5.0
9066,0,1,89,Indiana Jones and the Last Crusade (1989),5.0
9206,0,1,90,Pink Floyd: The Wall (1982),5.0
14254,0,1,190,From Russia with Love (1963),5.0
...,...,...,...,...,...
12997,0,1,170,"Mummy, The (1999)",2.0
11663,0,1,143,Toys (1992),2.0
11991,0,1,148,I Still Know What You Did Last Summer (1998),2.0
12117,0,1,152,Psycho (1998),2.0


In [203]:
# predicting for movies from the test data
movies_from_test=X_test.movie_id[X_test.user_id_old==uid].unique()
# prediction 
pred_concatenate=model_concatenate.predict([[uid]*len(movies_from_test),movies_from_test])
pred_dot=model_dot.predict([[uid]*len(movies_from_test),movies_from_test])
pred_add=model_add.predict([[uid]*len(movies_from_test),movies_from_test])
pred_substract=model_substract.predict([[uid]*len(movies_from_test),movies_from_test])
pred_multiply=model_multiply.predict([[uid]*len(movies_from_test),movies_from_test])

In [204]:
user_data_test=test[test.user_id==uid][cols]
user_data_test['model_concatenate']=pred_concatenate
user_data_test['model_dot']=pred_dot
user_data_test['model_add']=pred_add
user_data_test['model_substract']=pred_substract
user_data_test['model_multiply']=pred_multiply
user_data_test.sort_values(by='rating', ascending=False)

Unnamed: 0,user_id,movie_id,rating,model_concatenate,model_dot,model_add,model_substract,model_multiply
115,1,1954,5.0,3.681687,3.270614,3.846559,3.915456,3.834639
156,1,2427,5.0,3.831679,3.571286,4.03293,3.872842,3.731804
8,1,151,5.0,4.196156,4.124562,4.430522,4.303009,4.001982
88,1,1282,5.0,3.449518,3.444354,3.661545,3.428727,3.237103
147,1,2329,5.0,3.878249,3.170752,3.852866,3.870592,3.628895
103,1,1617,5.0,3.500389,3.207579,3.383346,3.50128,3.02204
179,1,2700,5.0,3.803867,3.389079,4.076006,3.833677,3.925233
135,1,2116,5.0,4.39679,3.839192,4.539169,4.510848,3.7695
158,1,2459,5.0,3.56859,3.506024,3.672828,3.533752,3.444415
149,1,2353,5.0,3.556995,3.499892,3.78276,3.525619,3.565284


In [210]:
movie_name=lambda movie: data.title[data.movie_id==movie]
top5_model_concatenate_test=user_data_test.nlargest(5,'model_concatenate').movie_id.map(movie_name)
top5_model_dot_test=user_data_test.nlargest(5,'model_dot').movie_id.map(movie_name)
top5_model_add_test=user_data_test.nlargest(5,'model_add').movie_id.map(movie_name)
top5_model_substract_test=user_data_test.nlargest(5,'model_substract').movie_id.map(movie_name)
top5_model_multiply_test=user_data_test.nlargest(5,'model_multiply').movie_id.map(movie_name)

pd.DataFrame({'top5_model_concatenate': top5_model_concatenate_test.values, 'top5_model_dot':top5_model_dot_test.values, 'top5_model_add':  top5_model_add_test.values, 'top5_model_substract': top5_model_substract_test.values, 'top5_model_mulitply': top5_model_multiply_test.values})

Unnamed: 0,top5_model_concatenate,top5_model_dot,top5_model_add,top5_model_substract,top5_model_mulitply
0,"80772 Odessa File, The (1974) 80773 Odes...","85367 Story of Women (Affaire de femmes, Un...",70504 Dark Days (2000) 70505 Dark Days (...,79094 Harold and Maude (1971) 79095 Haro...,59893 The Devil's Advocate (1997) 59894 ...
1,70504 Dark Days (2000) 70505 Dark Days (...,12101 Very Bad Things (1998) 12102 Very ...,"80772 Odessa File, The (1974) 80773 Odes...",70504 Dark Days (2000) 70505 Dark Days (...,"85367 Story of Women (Affaire de femmes, Un..."
2,79094 Harold and Maude (1971) 79095 Haro...,71869 Backdraft (1991) 71870 Backdraft (...,79094 Harold and Maude (1971) 79095 Haro...,"80772 Odessa File, The (1974) 80773 Odes...","53224 Bourne Identity, The (2002) 53225 ..."
3,"53224 Bourne Identity, The (2002) 53225 ...",79988 Shoot 'Em Up (2007) 79989 Shoot 'E...,12101 Very Bad Things (1998) 12102 Very ...,"45675 Nightmare on Elm Street, A (1984) 456...",79094 Harold and Maude (1971) 79095 Haro...
4,59893 The Devil's Advocate (1997) 59894 ...,"53224 Bourne Identity, The (2002) 53225 ...","85367 Story of Women (Affaire de femmes, Un...",80058 The Raid: Redemption (2011) 80059 ...,"30685 Specialist, The (1994) 30686 Speci..."


In [172]:
# the intersection between the above columns are greater then for DLRM_train_test_split