In [3]:
import pandas as pd
import zipfile
import numpy as np 

In [4]:
#  loading data
csv_ratings='ml-latest-small/ratings.csv'
csv_movies='ml-latest-small/movies.csv'
def get_data_ratings(csv_ratings,csv_movies):
    zf = zipfile.ZipFile('/home/elena/Downloads/ml-latest-small.zip')
    # reading ratings file:
    r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
    ratings = pd.read_csv(zf.open(csv_ratings), names=r_cols)
    m_cols=['movie_id', 'title', 'genre']
    movies = pd.read_csv(zf.open(csv_movies), names=m_cols)
    # merging ratings and movies
    # ratings=pd.merge(ratings,movies,on='movie_id')
    return pd.merge(ratings,movies,on='movie_id')

In [5]:
ratings=get_data_ratings(csv_ratings,csv_movies)

In [6]:
# loading train / test data
def train_test_data(ratings):
    unique_movies = ratings.movie_id.unique() # returns a np array
    movie_to_index = {old: new for new, old in enumerate(unique_movies)} # indexing movie_id, tart at 0
    index_to_movie = {idx: movie for movie, idx in movie_to_index.items()}
    new_movies = ratings.movie_id.map(movie_to_index) # replaces movie_id with coresp. index
    ratings['movie_index']=new_movies

    train=pd.read_pickle('/home/elena/Downloads/traindata.pkl')
    test=pd.read_pickle('/home/elena/Downloads/testdata.pkl')
    train['movie_index']=train.movie_id.map(movie_to_index)
    test['movie_index']=test.movie_id.map(movie_to_index)
    return (train,test)

In [5]:
train, test = train_test_data(ratings)

In [6]:
y_train=train.rating
y_test=test.rating
X_train=train.drop('rating', axis=1)
X_test=test.drop('rating', axis=1)

In [11]:
# Implementation 
from keras.layers import Input, Embedding, Concatenate, Flatten, Dense, Dot, Add, Multiply, Subtract, Average, LSTM
from keras.models import Model
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


In [12]:
seed=1
np.random.seed(seed)

In [89]:
def embedding_model(hidden_units, user_embedding_dim, user_max_cat_value, movie_embedding_dim, movie_max_cat_value,merging_method, recurrent):
    # Each instance will consist of two inputs: a single user id, and a single movie id
    user_id_input = Input(shape=(1,), name='user_id')
    movie_id_input = Input(shape=(1,), name='movie_id')
    # Embeddings
    user_embedded = Embedding(user_max_cat_value+1, user_embedding_dim, 
                                       input_length=1, name='user_embedding')(user_id_input)
    movie_embedded = Embedding(movie_max_cat_value+1, movie_embedding_dim, 
                                        input_length=1, name='movie_embedding')(movie_id_input)
    # merging the embeddings
    if merging_method=='concatenate':
        merged = Concatenate()([user_embedded, movie_embedded])
    if merging_method=='dot_product':
        merged =Dot(name = 'dot_product', normalize = True, axes = 2)([user_embedded, movie_embedded])
    if merging_method=='add':
        merged =Add()([user_embedded, movie_embedded])
    if merging_method=='substract':
        merged=Subtract()([user_embedded, movie_embedded])
    if merging_method=='multiply':
        merged=Multiply()([user_embedded, movie_embedded])
    if merging_method=='average':
        merged=Average()([user_embedded, movie_embedded])
    if recurrent == None:
        out = Flatten()(merged)
        for n_hidden in hidden_units:
            out = Dense(n_hidden, activation='relu')(out)
    else: 
        for n_hidden in hidden_units:
            out = LSTM(n_hidden, activation='relu')(merged)

    # A single output: our predicted rating
    out = Dense(1, activation='linear', name='prediction')(out)
    model = Model(inputs = [user_id_input, movie_id_input],outputs = out)
    model.compile(optimizer = 'Adam',loss='MSE',metrics=['MAE'])
    return model

# fitting models and predicting
def result(uid, mergemethod, X_train, X_test, y_train, y_test, rec):
    summary=pd.DataFrame(columns=['merge','val_MAE', 'MAE','epoch','val_loss', 'loss'])
    merge,epoch,val_MAE,MAE,loss,val_loss=[],[],[],[],[],[]
    # for prediction
    movies_test=X_test.movie_index[X_test.user_id==uid]
    predictions=pd.DataFrame(movies_test.values)
    predictions.columns=['movie_index']
    # looping through the merging methods
    for m in mergemethod:
        model=embedding_model(hidden_units, user_embedding_dim, user_max_cat_value, movie_embedding_dim, movie_max_cat_value,merging_method=m, recurrent=rec)
        history=model.fit(x=[X_train.user_id, X_train.movie_index], y=y_train, batch_size=500,epochs=10, verbose=0, validation_data=[[X_test.user_id, X_test.movie_index],y_test], callbacks=[es])
        # predicting for user uid
        pred=model.predict([[uid]*len(movies_test),movies_test.index])
        predictions[m]=pred
        # collecting MAE's and loss
        merge.append(m)
        n=len(history.epoch)
        epoch.append(n)
        val_MAE.append(history.history['val_MAE'][n-1])
        MAE.append(history.history['MAE'][n-1])
        loss.append(history.history['loss'][n-1])
        val_loss.append(history.history['val_loss'][n-1])
    summary['merge']=merge 
    summary['val_MAE']=val_MAE 
    summary['epoch']=epoch
    summary['MAE']=MAE 
    summary['loss']=loss 
    summary['val_loss']=val_loss
    top_5=pd.DataFrame()
    for n in mergemethod:
        top_5[n]=predictions.nlargest(5,n).movie_index.values
    return (summary, top_5)

In [70]:
# parameters for the models used
hidden_units = (100,50) #same as in pytorch model
movie_embedding_dim = 50 #same as in pytorch model
user_embedding_dim = 50  #same as in pytorch model
user_max_cat_value = ratings.user_id.max()
movie_max_cat_value=max(X_train.movie_index.max(), X_test.movie_index.max())
es=EarlyStopping(monitor='val_MAE', min_delta=0, patience=0, verbose=0, mode='min', baseline=None, restore_best_weights=False)
# for fitting and predicting
mergemethod=['concatenate','dot_product','add','substract', 'multiply','average']

In [71]:
summary1, prediction1=result(1,mergemethod, X_train, X_test, y_train, y_test, rec= None)

In [72]:
summary1

Unnamed: 0,merge,val_MAE,MAE,epoch,val_loss,loss
0,concatenate,0.666741,0.586439,9,0.758934,0.600377
1,dot_product,0.839333,0.819236,2,1.091499,1.053186
2,add,0.683493,0.626266,4,0.779204,0.67193
3,substract,0.68377,0.6379,3,0.779475,0.693045
4,multiply,0.74174,0.52081,3,0.906672,0.470945
5,average,0.673603,0.627302,4,0.769251,0.674279


In [114]:
# summary first run - best models are concat and average

Unnamed: 0,merge,val_MAE,MAE,epoch,val_loss,loss
0,concatenate,0.663808,0.587815,9,0.762902,0.60277
1,dot_product,0.837408,0.8194,2,1.091396,1.055124
2,add,0.678036,0.637935,3,0.775935,0.692228
3,substract,0.675556,0.619166,5,0.767822,0.659461
4,multiply,0.735127,0.525711,3,0.895239,0.478696
5,average,0.673198,0.636541,3,0.767651,0.690644


In [73]:
prediction1

Unnamed: 0,concatenate,dot_product,add,substract,multiply,average
0,70,147,147,147,164,70
1,114,164,70,114,53,147
2,147,127,179,226,127,222
3,226,35,219,222,103,219
4,222,161,114,70,87,103


In [74]:
# here we repeat the above steps but prepare the data for the model by using an Integer Encoding (from kaggle) instead of indexing the movie_id

In [75]:
from sklearn.preprocessing import LabelEncoder
# loading train / test data
def train_test_data_enc(ratings):
    train=pd.read_pickle('/home/elena/Downloads/traindata.pkl')
    test=pd.read_pickle('/home/elena/Downloads/testdata.pkl')
    label_encoder = LabelEncoder()
    label_encoder.fit(ratings['movie_id'])
    train['movie_index'] = label_encoder.transform(train['movie_id']) # calling it index to use the function above!
    test['movie_index'] = label_encoder.transform(test['movie_id'])
    return (train,test)

In [77]:
train_enc, test_enc=train_test_data_enc(ratings)
y_train_enc=train_enc.rating
y_test_enc=test_enc.rating
X_train_enc=train_enc.drop('rating',axis=1)
X_test_enc=test_enc.drop('rating',axis=1)
movie_max_cat_value_enc=max(X_train_enc.movie_index.max(), X_test_enc.movie_index.max())

In [80]:
summary_enc, prediction_enc=result(1,mergemethod, X_train_enc, X_test_enc, y_train_enc, y_test_enc,rec= None)

In [81]:
pd.merge(summary1[['merge','val_MAE','MAE']],summary_enc[['merge','val_MAE','MAE']],on='merge')

Unnamed: 0,merge,val_MAE_x,MAE_x,val_MAE_y,MAE_y
0,concatenate,0.666741,0.586439,0.675895,0.627993
1,dot_product,0.839333,0.819236,0.83447,0.82509
2,add,0.683493,0.626266,0.674154,0.626612
3,substract,0.68377,0.6379,0.678025,0.62605
4,multiply,0.74174,0.52081,0.745583,0.524609
5,average,0.673603,0.627302,0.678869,0.635407


In [82]:
# let's now use a recurrent neural network -> LSTM
# we will use the data with movie_index since we got a little better result there

In [91]:
summary_lstm, prediction_lstm=result(1,mergemethod, X_train, X_test, y_train, y_test, rec= 1)

In [93]:
pd.merge(summary1[['merge','val_MAE','MAE']],summary_lstm[['merge','val_MAE','MAE']],on='merge')
# almost the same MAE

Unnamed: 0,merge,val_MAE_x,MAE_x,val_MAE_y,MAE_y
0,concatenate,0.685221,0.636862,0.686137,0.637342
1,dot_product,0.713182,0.381853,0.721797,0.416107
2,add,0.696987,0.625092,0.69923,0.627272
3,substract,0.697403,0.640078,0.695633,0.632508
4,multiply,0.753478,0.510704,0.738477,0.533977
5,average,0.689902,0.623564,0.689554,0.629755


In [96]:
prediction_lstm

Unnamed: 0,concatenate,dot_product,add,substract,multiply,average
0,70,70,147,147,70,70
1,147,115,114,114,226,147
2,75,147,222,222,222,114
3,114,219,70,70,127,103
4,87,114,75,226,75,164


In [15]:
# Let's now predict movies and time!

In [16]:
train, test = train_test_data(ratings)
y_train=train[['rating','unix_timestamp']]
y_test=test[['rating','unix_timestamp']]
X_train=train.drop(['rating','unix_timestamp'], axis=1)
X_test=test.drop(['rating','unix_timestamp'], axis=1)

In [17]:
# normalizing unix_timestamp
max_train=y_train.unix_timestamp.max()
y_train['norm']=[float(i)/max_train for i in y_train.unix_timestamp]
max_test=y_test.unix_timestamp.max()
y_test['norm']=[float(i)/max_test for i in y_test.unix_timestamp]

In [18]:
y_train.norm.max(),y_train.norm.min(), y_test.norm.max(),y_test.norm.min()

(1.0, 0.5385275978108841, 1.0, 0.5385128227888003)

In [26]:
# parameters for the models used
hidden_units = (100,50) #same as in pytorch model
movie_embedding_dim = 50 #same as in pytorch model
user_embedding_dim = 50  #same as in pytorch model
user_max_cat_value = ratings.user_id.max()
movie_max_cat_value=X_test.movie_index.max()
es=EarlyStopping(monitor='val_MAE', min_delta=0, patience=0, verbose=0, mode='min', baseline=None, restore_best_weights=False)
# the model 
#
#
def embedding_model_mult_out(hidden_units, user_embedding_dim, user_max_cat_value, movie_embedding_dim, movie_max_cat_value,merging_method, recurrent):
    # Each instance will consist of two inputs: a single user id, and a single movie id
    user_id_input = Input(shape=(1,), name='user_id')
    movie_id_input = Input(shape=(1,), name='movie_id')
    # Embeddings
    user_embedded = Embedding(user_max_cat_value+1, user_embedding_dim, 
                                       input_length=1, name='user_embedding')(user_id_input)
    movie_embedded = Embedding(movie_max_cat_value+1, movie_embedding_dim, 
                                        input_length=1, name='movie_embedding')(movie_id_input)
    # merging the embeddings
    if merging_method=='concatenate':
        merged = Concatenate()([user_embedded, movie_embedded])
    if merging_method=='dot_product':
        merged =Dot(name = 'dot_product', normalize = True, axes = 2)([user_embedded, movie_embedded])
    if merging_method=='add':
        merged =Add()([user_embedded, movie_embedded])
    if merging_method=='substract':
        merged=Subtract()([user_embedded, movie_embedded])
    if merging_method=='multiply':
        merged=Multiply()([user_embedded, movie_embedded])
    if merging_method=='average':
        merged=Average()([user_embedded, movie_embedded])
    if recurrent == None:
        out = Flatten()(merged)
        for n_hidden in hidden_units:
            out = Dense(n_hidden, activation='relu')(out)
    else: 
        for n_hidden in hidden_units:
            out = LSTM(n_hidden, activation='relu')(merged)

    #Two outputs: our predicted rating and unix_timestamp
    out_movie = Dense(1, activation='linear', name='movies_pred')(out)
    out_time = Dense(1, activation='relu', name='time_pred')(out)
    model = Model(inputs = [user_id_input, movie_id_input],outputs = [out_movie, out_time])
    model.compile(optimizer = 'Adam',loss='MSE',metrics=['MAE'])
    return model

In [27]:
model=embedding_model_mult_out(hidden_units, user_embedding_dim, user_max_cat_value, movie_embedding_dim, movie_max_cat_value,merging_method='average', recurrent= None)
history=model.fit(x=[X_train.user_id, X_train.movie_index], y=[y_train.rating,y_train.norm], batch_size=500,epochs=10, verbose=2, validation_data=[[X_test.user_id, X_test.movie_index],[y_test.rating,y_test.norm]], callbacks=[es])

Train on 75627 samples, validate on 25209 samples
Epoch 1/10
 - 5s - loss: 3.6784 - movies_pred_loss: 3.5300 - time_pred_loss: 0.1351 - movies_pred_MAE: 1.3891 - time_pred_MAE: 0.2786 - val_loss: 0.8145 - val_movies_pred_loss: 0.7855 - val_time_pred_loss: 0.0285 - val_movies_pred_MAE: 0.6836 - val_time_pred_MAE: 0.1354
Epoch 2/10
 - 4s - loss: 0.7535 - movies_pred_loss: 0.7312 - time_pred_loss: 0.0229 - movies_pred_MAE: 0.6574 - time_pred_MAE: 0.1217 - val_loss: 0.7796 - val_movies_pred_loss: 0.7639 - val_time_pred_loss: 0.0149 - val_movies_pred_MAE: 0.6717 - val_time_pred_MAE: 0.0992
Epoch 3/10
 - 4s - loss: 0.6983 - movies_pred_loss: 0.6898 - time_pred_loss: 0.0084 - movies_pred_MAE: 0.6363 - time_pred_MAE: 0.0716 - val_loss: 0.7699 - val_movies_pred_loss: 0.7645 - val_time_pred_loss: 0.0046 - val_movies_pred_MAE: 0.6707 - val_time_pred_MAE: 0.0518
Epoch 4/10
 - 4s - loss: 0.6763 - movies_pred_loss: 0.6739 - time_pred_loss: 0.0032 - movies_pred_MAE: 0.6268 - time_pred_MAE: 0.0423 - v

In [28]:
history.history['movies_pred_MAE'][9], history.history['val_movies_pred_MAE'][9], history.history['time_pred_MAE'][9], history.history['val_time_pred_MAE'][9]

(0.59185225, 0.6603676676750183, 0.025341213, 0.02654862217605114)

In [29]:
uid=1
movies_test=X_test.movie_index[X_test.user_id==uid]
predictions=pd.DataFrame(movies_test.values)
predictions.columns=['movie_index']
pred_movies=model.predict([[uid]*len(movies_test),movies_test.index]) # returns a list with 2 columns
predictions['rating_pred']=pred_movies[0]
predictions['time_pred']=pred_movies[1]

In [30]:
predictions.head()

Unnamed: 0,movie_index,rating_pred,time_pred
0,226,4.426295,0.611767
1,149,4.337087,0.619911
2,87,4.556562,0.618144
3,35,4.096316,0.607888
4,134,4.401696,0.60582


In [44]:
top_5=pd.DataFrame()
top_5['pred']=predictions.nlargest(5,'rating_pred').movie_index.values
top_5

Unnamed: 0,pred
0,70
1,147
2,114
3,219
4,75


In [45]:
# recurrent
model=embedding_model_mult_out(hidden_units, user_embedding_dim, user_max_cat_value, movie_embedding_dim, movie_max_cat_value,merging_method='average', recurrent= 1)
history=model.fit(x=[X_train.user_id, X_train.movie_index], y=[y_train.rating,y_train.norm], batch_size=500,epochs=10, verbose=2, validation_data=[[X_test.user_id, X_test.movie_index],[y_test.rating,y_test.norm]], callbacks=[es])

Train on 75627 samples, validate on 25209 samples
Epoch 1/10
 - 5s - loss: 7.7339 - movies_pred_loss: 7.4057 - time_pred_loss: 0.2959 - movies_pred_MAE: 2.3357 - time_pred_MAE: 0.4648 - val_loss: 1.1648 - val_movies_pred_loss: 1.1191 - val_time_pred_loss: 0.0446 - val_movies_pred_MAE: 0.8333 - val_time_pred_MAE: 0.1685
Epoch 2/10
 - 4s - loss: 0.9250 - movies_pred_loss: 0.8883 - time_pred_loss: 0.0367 - movies_pred_MAE: 0.7327 - time_pred_MAE: 0.1538 - val_loss: 0.8749 - val_movies_pred_loss: 0.8390 - val_time_pred_loss: 0.0350 - val_movies_pred_MAE: 0.7079 - val_time_pred_MAE: 0.1496
Epoch 3/10
 - 5s - loss: 0.7692 - movies_pred_loss: 0.7375 - time_pred_loss: 0.0326 - movies_pred_MAE: 0.6611 - time_pred_MAE: 0.1446 - val_loss: 0.8402 - val_movies_pred_loss: 0.8060 - val_time_pred_loss: 0.0334 - val_movies_pred_MAE: 0.6917 - val_time_pred_MAE: 0.1455
Epoch 4/10
 - 4s - loss: 0.7270 - movies_pred_loss: 0.6962 - time_pred_loss: 0.0308 - movies_pred_MAE: 0.6404 - time_pred_MAE: 0.1403 - v

In [46]:
history.history['movies_pred_MAE'][9], history.history['val_movies_pred_MAE'][9], history.history['time_pred_MAE'][9], history.history['val_time_pred_MAE'][9]
# to compate without lstm 
# (0.59185225, 0.6603676676750183, 0.025341213, 0.02654862217605114)
# without lstm better!

(0.61471206, 0.6896655559539795, 0.029456278, 0.03006492368876934)