In [35]:
import pandas as pd
import zipfile
import numpy as np 

In [36]:
#  loading data
csv_ratings='ml-latest-small/ratings.csv'
csv_movies='ml-latest-small/movies.csv'
def get_data(csv_ratings,csv_movies):
    zf = zipfile.ZipFile('/home/elena/Downloads/ml-latest-small.zip')
    # reading ratings file:
    r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
    ratings = pd.read_csv(zf.open(csv_ratings), names=r_cols)
    m_cols=['movie_id', 'title', 'genre']
    movies = pd.read_csv(zf.open(csv_movies), names=m_cols)
    # merging ratings and movies
    ratings=pd.merge(ratings,movies,on='movie_id')
    zz = zipfile.ZipFile('/home/elena/Downloads/ml-100k.zip')
    # reading users file:
    u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
    users = pd.read_csv(zz.open('ml-100k/u.user'), sep='|', names=u_cols,encoding='latin-1')
    return pd.merge(users,ratings, on='user_id')

In [47]:
data=get_data(csv_ratings,csv_movies)

In [48]:
# error I did before was to split the data in train & test randomly, but since the data is timestamped, I should use the first e.g. 80% for training, and the last 20% for testing! 
# To do so, I need to sort the data via unix_timestamp

In [59]:
def train_test(data):
    data=data.sort_values(by=['unix_timestamp'], ascending=True)
    data=data.reset_index(drop=True)
    split=int(data.shape[0]*0.75)
    train=data.iloc[0:split]
    test=data.iloc[split:data.shape[0]]
    return (data,train,test)

In [60]:
data, train, test=train_test(data)

In [None]:
# I indexed, and encoded cat features, BEFORE splitting, because I don't know how to handle new unseen index / encodings

In [65]:
def indexing(data,train,test):
    unique_movies = data.movie_id.unique() # returns a np array
    movie_to_index = {old: new for new, old in enumerate(unique_movies)} # indexing movie_id, tart at 0
    index_to_movie = {idx: movie for movie, idx in movie_to_index.items()}
    new_movies = data.movie_id.map(movie_to_index) # replaces movie_id with coresp. index
    data['movie_index']=new_movies
    train['movie_index']=train.movie_id.map(movie_to_index)
    test['movie_index']=test.movie_id.map(movie_to_index)
    return (data,train,test)

In [71]:
data, train, test=indexing(data,train,test)
y_train=train['rating']
y_test=test['rating']
X_train=train.drop('rating', axis=1)
X_test=test.drop('rating', axis=1)

In [83]:
# Implementation 
from keras.layers import Input, Embedding, Concatenate, Flatten, Dense, Dot, Add, Multiply, Subtract, Average, LSTM
from keras.models import Model
from keras.callbacks import EarlyStopping

In [73]:
def embedding_model(hidden_units, user_embedding_dim, user_max_cat_value, movie_embedding_dim, movie_max_cat_value,merging_method):
    # Each instance will consist of two inputs: a single user id, and a single movie id
    user_id_input = Input(shape=(1,), name='user_id')
    movie_id_input = Input(shape=(1,), name='movie_id')
    # Embeddings
    user_embedded = Embedding(user_max_cat_value+1, user_embedding_dim, 
                                       input_length=1, name='user_embedding')(user_id_input)
    movie_embedded = Embedding(movie_max_cat_value+1, movie_embedding_dim, 
                                        input_length=1, name='movie_embedding')(movie_id_input)
    # merging the embeddings
    if merging_method=='concatenate':
        merged = Concatenate()([user_embedded, movie_embedded])
    if merging_method=='dot_product':
        merged =Dot(name = 'dot_product', normalize = True, axes = 2)([user_embedded, movie_embedded])
    if merging_method=='add':
        merged =Add()([user_embedded, movie_embedded])
    if merging_method=='substract':
        merged=Subtract()([user_embedded, movie_embedded])
    if merging_method=='multiply':
        merged=Multiply()([user_embedded, movie_embedded])
    if merging_method=='average':
        merged=Average()([user_embedded, movie_embedded])
    out = Flatten()(merged)

    # Add one or more hidden layers
    for n_hidden in hidden_units:
        out = Dense(n_hidden, activation='relu')(out)

    # A single output: our predicted rating
    out = Dense(1, activation='linear', name='prediction')(out)
    model = Model(inputs = [user_id_input, movie_id_input],outputs = out)
    model.compile(optimizer = 'Adam',loss='MSE',metrics=['MAE'])
    return model

In [74]:
# parameters for the models used
hidden_units = (100,50) #same as in pytorch model
movie_embedding_dim = 50 #same as in pytorch model
user_embedding_dim = 50  #same as in pytorch model
user_max_cat_value = data.user_id.max()
movie_max_cat_value=max(train.movie_index.max(), test.movie_index.max())
es=EarlyStopping(monitor='val_MAE', min_delta=0, patience=0, verbose=0, mode='min', baseline=None, restore_best_weights=False)

In [84]:
mergemethod=['concatenate','dot_product','add','substract', 'multiply','average']
summary=pd.DataFrame(columns=['merge','val_MAE', 'MAE','epoch','val_loss', 'loss'])
merge,epoch,val_MAE,MAE,loss,val_loss=[],[],[],[],[],[]
# for prediction
uid=1
movies_test=X_test.movie_index[X_test.user_id==uid]
predictions=pd.DataFrame(movies_test.values)
predictions.columns=['movie_index']
# looping through the merging methods
for m in mergemethod:
    model=embedding_model(hidden_units, user_embedding_dim, user_max_cat_value, movie_embedding_dim, movie_max_cat_value,merging_method=m)
    history=model.fit(x=[X_train.user_id, X_train.movie_index], y=y_train, batch_size=500,epochs=10, verbose=0, validation_data=[[X_test.user_id, X_test.movie_index],y_test], callbacks=[es])
    # predicting for user uid
    pred=model.predict([[uid]*len(movies_test),movies_test.index])
    predictions[m]=pred
    # collecting MAE's and loss
    merge.append(m)
    n=len(history.epoch)
    epoch.append(n)
    val_MAE.append(history.history['val_MAE'][n-1])
    MAE.append(history.history['MAE'][n-1])
    loss.append(history.history['loss'][n-1])
    val_loss.append(history.history['val_loss'][n-1])
summary['merge']=merge 
summary['val_MAE']=val_MAE 
summary['epoch']=epoch
summary['MAE']=MAE 
summary['loss']=loss 
summary['val_loss']=val_loss

In [76]:
summary

Unnamed: 0,merge,val_MAE,MAE,epoch,val_loss,loss
0,concatenate,1.037041,0.578878,10,1.585603,0.582871
1,dot_product,0.861144,0.816102,2,1.170489,1.04317
2,add,1.190237,0.599088,8,1.971669,0.619108
3,substract,1.120649,0.613489,6,1.783306,0.644329
4,multiply,0.896178,0.733016,2,1.198117,0.873399
5,average,1.155865,0.625325,8,1.872036,0.667989


In [86]:
mergemethod=['concatenate','dot_product','add','substract', 'multiply','average']
summary=pd.DataFrame(columns=['merge','val_MAE', 'MAE','epoch','val_loss', 'loss'])
merge,epoch,val_MAE,MAE,loss,val_loss=[],[],[],[],[],[]
# for prediction
uid=1
movies_test=X_test.movie_index[X_test.user_id==uid]
predictions=pd.DataFrame(movies_test.values)
predictions.columns=['movie_index']
# looping through the merging methods
for m in mergemethod:
    model=embedding_model(hidden_units, user_embedding_dim, user_max_cat_value, movie_embedding_dim, movie_max_cat_value,merging_method=m)
    history=model.fit(x=[data.user_id, data.movie_index], y=data.rating, batch_size=500,epochs=10, verbose=0, validation_split=0.25, callbacks=[es])
    # predicting for user uid
    pred=model.predict([[uid]*len(movies_test),movies_test.index])
    predictions[m]=pred
    # collecting MAE's and loss
    merge.append(m)
    n=len(history.epoch)
    epoch.append(n)
    val_MAE.append(history.history['val_MAE'][n-1])
    MAE.append(history.history['MAE'][n-1])
    loss.append(history.history['loss'][n-1])
    val_loss.append(history.history['val_loss'][n-1])
summary['merge']=merge 
summary['val_MAE']=val_MAE 
summary['epoch']=epoch
summary['MAE']=MAE 
summary['loss']=loss 
summary['val_loss']=val_loss

In [87]:
summary

Unnamed: 0,merge,val_MAE,MAE,epoch,val_loss,loss
0,concatenate,1.0755,0.556109,10,1.673567,0.540143
1,dot_product,0.863385,0.810375,2,1.16589,1.036259
2,add,1.167191,0.566769,8,1.948975,0.560297
3,substract,1.195468,0.56931,9,2.022678,0.566003
4,multiply,0.90597,0.722775,2,1.216452,0.854723
5,average,0.985534,0.61132,10,1.433625,0.643704


In [79]:
# much worse then before! dot product and multiply turned out to be the 'best'
# lets try a LSTM layer

In [81]:
def embeddingLSTM_model(hidden_units,user_embedding_dim, user_max_cat_value, movie_embedding_dim, movie_max_cat_value,merging_method):
    # Each instance will consist of two inputs: a single user id, and a single movie id
    user_id_input = Input(shape=(1,), name='user_id')
    movie_id_input = Input(shape=(1,), name='movie_id')
    # Embeddings
    user_embedded = Embedding(user_max_cat_value+1, user_embedding_dim, 
                                       input_length=1, name='user_embedding')(user_id_input)
    movie_embedded = Embedding(movie_max_cat_value+1, movie_embedding_dim, 
                                        input_length=1, name='movie_embedding')(movie_id_input)
    # merging the embeddings
    if merging_method=='concatenate':
        merged = Concatenate()([user_embedded, movie_embedded])
    if merging_method=='dot_product':
        merged =Dot(name = 'dot_product', normalize = True, axes = 2)([user_embedded, movie_embedded])
    if merging_method=='add':
        merged =Add()([user_embedded, movie_embedded])
    if merging_method=='substract':
        merged=Subtract()([user_embedded, movie_embedded])
    if merging_method=='multiply':
        merged=Multiply()([user_embedded, movie_embedded])
    if merging_method=='average':
        merged=Average()([user_embedded, movie_embedded])
    #out = Flatten()(merged)

    # Add one or more hidden layers
    for n_hidden in hidden_units:
        out = LSTM(n_hidden, activation='relu')(merged)# LSTM instead of Dense

    # A single output: our predicted rating
    out = Dense(1, activation='linear', name='prediction')(out)
    model = Model(inputs = [user_id_input, movie_id_input],outputs = out)
    model.compile(optimizer = 'Adam',loss='MSE',metrics=['MAE'])
    return model

In [88]:
mergemethod=['concatenate','dot_product','add','substract', 'multiply','average']
summary=pd.DataFrame(columns=['merge','val_MAE', 'MAE','epoch','val_loss', 'loss'])
merge,epoch,val_MAE,MAE,loss,val_loss=[],[],[],[],[],[]
# for prediction
uid=1
movies_test=X_test.movie_index[X_test.user_id==uid]
predictions=pd.DataFrame(movies_test.values)
predictions.columns=['movie_index']
# looping through the merging methods
for m in mergemethod:
    model=embeddingLSTM_model(hidden_units, user_embedding_dim, user_max_cat_value, movie_embedding_dim, movie_max_cat_value,merging_method=m)
    history=model.fit(x=[data.user_id, data.movie_index], y=data.rating, batch_size=500,epochs=10, verbose=0, validation_split=0.75, callbacks=[es])
    # predicting for user uid
    pred=model.predict([[uid]*len(movies_test),movies_test.index])
    predictions[m]=pred
    # collecting MAE's and loss
    merge.append(m)
    n=len(history.epoch)
    epoch.append(n)
    val_MAE.append(history.history['val_MAE'][n-1])
    MAE.append(history.history['MAE'][n-1])
    loss.append(history.history['loss'][n-1])
    val_loss.append(history.history['val_loss'][n-1])
summary['merge']=merge 
summary['val_MAE']=val_MAE 
summary['epoch']=epoch
summary['MAE']=MAE 
summary['loss']=loss 
summary['val_loss']=val_loss

In [89]:
summary

Unnamed: 0,merge,val_MAE,MAE,epoch,val_loss,loss
0,concatenate,2.696847,0.767699,3,8.523975,0.948813
1,dot_product,1.757958,0.436966,10,3.928167,0.338548
2,add,2.678453,0.763909,3,8.417517,0.937233
3,substract,2.605697,0.658283,10,8.061807,0.708656
4,multiply,2.190618,0.557637,10,5.717753,0.524398
5,average,2.527856,0.657762,10,7.621267,0.705806


In [None]:
# much much much worse!!