In [1]:
import pandas as pd
import zipfile
import numpy as np 

In [2]:
#  loading data
csv_ratings='ml-latest-small/ratings.csv'
csv_movies='ml-latest-small/movies.csv'
def get_data_ratings(csv_ratings,csv_movies):
    zf = zipfile.ZipFile('/home/elena/Downloads/ml-latest-small.zip')
    # reading ratings file:
    r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
    ratings = pd.read_csv(zf.open(csv_ratings), names=r_cols)
    m_cols=['movie_id', 'title', 'genre']
    movies = pd.read_csv(zf.open(csv_movies), names=m_cols)
    # merging ratings and movies
    # ratings=pd.merge(ratings,movies,on='movie_id')
    return pd.merge(ratings,movies,on='movie_id')

In [3]:
ratings=get_data_ratings(csv_ratings,csv_movies)

In [4]:
# loading train / test data
def train_test_data(ratings):
    unique_movies = ratings.movie_id.unique() # returns a np array
    movie_to_index = {old: new for new, old in enumerate(unique_movies)} # indexing movie_id, tart at 0
    index_to_movie = {idx: movie for movie, idx in movie_to_index.items()}
    new_movies = ratings.movie_id.map(movie_to_index) # replaces movie_id with coresp. index
    ratings['movie_index']=new_movies

    train=pd.read_pickle('/home/elena/Downloads/traindata.pkl')
    test=pd.read_pickle('/home/elena/Downloads/testdata.pkl')
    train['movie_index']=train.movie_id.map(movie_to_index)
    test['movie_index']=test.movie_id.map(movie_to_index)
    return (train,test)

In [5]:
train, test = train_test_data(ratings)

In [6]:
y_train=train.rating
y_test=test.rating
X_train=train.drop('rating', axis=1)
X_test=test.drop('rating', axis=1)

In [7]:
# Implementation 
from keras.layers import Input, Embedding, Concatenate, Flatten, Dense, Dot, Add, Multiply, Subtract, Average
from keras.models import Model
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


In [8]:
def embedding_model(hidden_units, user_embedding_dim, user_max_cat_value, movie_embedding_dim, movie_max_cat_value,merging_method):
    # Each instance will consist of two inputs: a single user id, and a single movie id
    user_id_input = Input(shape=(1,), name='user_id')
    movie_id_input = Input(shape=(1,), name='movie_id')
    # Embeddings
    user_embedded = Embedding(user_max_cat_value+1, user_embedding_dim, 
                                       input_length=1, name='user_embedding')(user_id_input)
    movie_embedded = Embedding(movie_max_cat_value+1, movie_embedding_dim, 
                                        input_length=1, name='movie_embedding')(movie_id_input)
    # merging the embeddings
    if merging_method=='concatenate':
        merged = Concatenate()([user_embedded, movie_embedded])
    if merging_method=='dot_product':
        merged =Dot(name = 'dot_product', normalize = True, axes = 2)([user_embedded, movie_embedded])
    if merging_method=='add':
        merged =Add()([user_embedded, movie_embedded])
    if merging_method=='substract':
        merged=Subtract()([user_embedded, movie_embedded])
    if merging_method=='multiply':
        merged=Multiply()([user_embedded, movie_embedded])
    if merging_method=='average':
        merged=Average()([user_embedded, movie_embedded])
    out = Flatten()(merged)

    # Add one or more hidden layers
    for n_hidden in hidden_units:
        out = Dense(n_hidden, activation='relu')(out)

    # A single output: our predicted rating
    out = Dense(1, activation='linear', name='prediction')(out)
    model = Model(inputs = [user_id_input, movie_id_input],outputs = out)
    model.compile(optimizer = 'Adam',loss='MSE',metrics=['MAE'])
    return model

In [9]:
# parameters for the models used
hidden_units = (100,50) #same as in pytorch model
movie_embedding_dim = 50 #same as in pytorch model
user_embedding_dim = 50  #same as in pytorch model
user_max_cat_value = ratings.user_id.max()
movie_max_cat_value=max(X_train.movie_index.max(), X_test.movie_index.max())
es=EarlyStopping(monitor='val_MAE', min_delta=0, patience=0, verbose=0, mode='min', baseline=None, restore_best_weights=False)

In [10]:
mergemethod=['concatenate','dot_product','add','substract', 'multiply','average']
summary=pd.DataFrame(columns=['merge','val_MAE', 'MAE','epoch','val_loss', 'loss'])
merge,epoch,val_MAE,MAE,loss,val_loss=[],[],[],[],[],[]
# for prediction
uid=1
movies_test=X_test.movie_index[X_test.user_id==uid]
predictions=pd.DataFrame(movies_test.values)
predictions.columns=['movie_index']
# looping through the merging methods
for m in mergemethod:
    model=embedding_model(hidden_units, user_embedding_dim, user_max_cat_value, movie_embedding_dim, movie_max_cat_value,merging_method=m)
    history=model.fit(x=[X_train.user_id, X_train.movie_index], y=y_train, batch_size=500,epochs=10, verbose=0, validation_data=[[X_test.user_id, X_test.movie_index],y_test], callbacks=[es])
    # predicting for user uid
    pred=model.predict([[uid]*len(movies_test),movies_test.index])
    predictions[m]=pred
    # collecting MAE's and loss
    merge.append(m)
    n=len(history.epoch)
    epoch.append(n)
    val_MAE.append(history.history['val_MAE'][n-1])
    MAE.append(history.history['MAE'][n-1])
    loss.append(history.history['loss'][n-1])
    val_loss.append(history.history['val_loss'][n-1])
summary['merge']=merge 
summary['val_MAE']=val_MAE 
summary['epoch']=epoch
summary['MAE']=MAE 
summary['loss']=loss 
summary['val_loss']=val_loss

In [11]:
summary # best models are add, concat and average

Unnamed: 0,merge,val_MAE,MAE,epoch,val_loss,loss
0,concatenate,0.675439,0.637787,3,0.772846,0.693698
1,dot_product,0.787453,0.456959,4,1.012548,0.388622
2,add,0.673487,0.613035,6,0.770759,0.650033
3,substract,0.676568,0.627759,4,0.774539,0.675542
4,multiply,0.726129,0.535156,3,0.872359,0.49763
5,average,0.676141,0.635752,3,0.767803,0.689311


In [114]:
# summary first run - best models are concat and average

Unnamed: 0,merge,val_MAE,MAE,epoch,val_loss,loss
0,concatenate,0.663808,0.587815,9,0.762902,0.60277
1,dot_product,0.837408,0.8194,2,1.091396,1.055124
2,add,0.678036,0.637935,3,0.775935,0.692228
3,substract,0.675556,0.619166,5,0.767822,0.659461
4,multiply,0.735127,0.525711,3,0.895239,0.478696
5,average,0.673198,0.636541,3,0.767651,0.690644


In [161]:
# top 5 
# 2 run
top_5=pd.DataFrame()
for n in mergemethod:
    top_5[n]=predictions.nlargest(5,n).movie_index.values
top_5
# model concat and average recommend the same movies, the order is just different!
# models sub and add recommend 4 out of 5 movies from model concat and average
# movie 147 is recommended by all movies
# movie 70,103,2019,114 is recommended by 4 out of 6 models

Unnamed: 0,concatenate,dot_product,add,substract,multiply,average
0,147,54,70,70,147,70
1,114,219,114,103,75,114
2,103,195,147,147,134,219
3,70,199,87,127,97,147
4,219,87,103,114,219,103


In [12]:
top_5=pd.DataFrame()
for n in mergemethod:
    top_5[n]=predictions.nlargest(5,n).movie_index.values
top_5
# model concat and average recommend 3 same movies, model concat and add also
# model add and average recommend 4 same movies, add and substract also
# 147 is recommended by all models
# models sub and add recommend 4 out of 5 movies from model concat and average
# movie 147 is recommended by all movies
# movie 70,103,2019,114 is recommended by 4 out of 6 models

Unnamed: 0,concatenate,dot_product,add,substract,multiply,average
0,70,182,70,147,219,147
1,147,199,147,70,226,70
2,103,70,114,114,75,114
3,219,75,222,222,179,202
4,114,147,87,219,147,87


In [162]:
# model concat and average recommend the same movies, the order is just different!
# models sub and add recommend 4 out of 5 movies from model concat and average
# movie 147 is recommended by all movies
# movie 70,103,2019,114 is recommended by 4 out of 6 models

In [163]:
# here we repeat the above steps but prepare the data for the model by using an Integer Encoding (from kaggle) instead of indexing the movie_id

In [13]:
from sklearn.preprocessing import LabelEncoder
# loading train / test data
def train_test_data_enc(ratings):
    train=pd.read_pickle('/home/elena/Downloads/traindata.pkl')
    test=pd.read_pickle('/home/elena/Downloads/testdata.pkl')
    label_encoder = LabelEncoder()
    label_encoder.fit(ratings['movie_id'])
    train['movie_enc'] = label_encoder.transform(train['movie_id'])
    test['movie_enc'] = label_encoder.transform(test['movie_id'])
    return (train,test)

In [55]:
train_enc, test_enc=train_test_data_enc(ratings)
y_train_enc=train_enc.rating
y_test_enc=test_enc.rating
X_train_enc=train_enc.drop('rating',axis=1)
X_test_enc=test_enc.drop('rating',axis=1)
movie_max_cat_value_enc=max(X_train_enc.movie_enc.max(), X_test_enc.movie_enc.max())

In [15]:
mergemethod=['concatenate','dot_product','add','substract', 'multiply','average']
summary_enc=pd.DataFrame(columns=['merge','val_MAE', 'MAE','epoch','val_loss', 'loss'])
merge,epoch,val_MAE,MAE,loss,val_loss=[],[],[],[],[],[]
# for prediction
uid=1
movies_test=X_test_enc.movie_enc[X_test_enc.user_id==uid]
predictions=pd.DataFrame(movies_test.values)
predictions.columns=['movie_index']
# looping through the merging methods
for m in mergemethod:
    model=embedding_model(hidden_units, user_embedding_dim, user_max_cat_value, movie_embedding_dim, movie_max_cat_value_enc,merging_method=m)
    history=model.fit(x=[X_train_enc.user_id, X_train_enc.movie_enc], y=y_train_enc, batch_size=500,epochs=10, verbose=0, validation_data=[[X_test_enc.user_id, X_test_enc.movie_enc],y_test_enc], callbacks=[es])
    # predicting for user uid
    pred=model.predict([[uid]*len(movies_test),movies_test.index])
    predictions[m]=pred
    # collecting MAE's and loss
    merge.append(m)
    n=len(history.epoch)
    epoch.append(n)
    val_MAE.append(history.history['val_MAE'][n-1])
    MAE.append(history.history['MAE'][n-1])
    loss.append(history.history['loss'][n-1])
    val_loss.append(history.history['val_loss'][n-1])
summary_enc['merge']=merge 
summary_enc['val_MAE']=val_MAE 
summary_enc['epoch']=epoch
summary_enc['MAE']=MAE 
summary_enc['loss']=loss 
summary_enc['val_loss']=val_loss

In [178]:
# 1. run
pd.merge(summary[['merge','val_MAE','MAE']],summary_enc[['merge','val_MAE','MAE']],on='merge')

Unnamed: 0,merge,val_MAE_x,MAE_x,val_MAE_y,MAE_y
0,concatenate,0.673352,0.626355,0.673125,0.630098
1,dot_product,0.834497,0.825047,0.833816,0.823366
2,add,0.683223,0.628528,0.676729,0.616632
3,substract,0.67629,0.627173,0.677572,0.627651
4,multiply,0.731964,0.516169,0.732915,0.523372
5,average,0.673574,0.617534,0.674682,0.635786


In [16]:
pd.merge(summary[['merge','val_MAE','MAE']],summary_enc[['merge','val_MAE','MAE']],on='merge')
# model with indexing stays a little bit better

Unnamed: 0,merge,val_MAE_x,MAE_x,val_MAE_y,MAE_y
0,concatenate,0.675439,0.637787,0.678883,0.638073
1,dot_product,0.787453,0.456959,0.801555,0.456958
2,add,0.673487,0.613035,0.678191,0.636344
3,substract,0.676568,0.627759,0.678331,0.626237
4,multiply,0.726129,0.535156,0.736653,0.5107
5,average,0.676141,0.635752,0.677634,0.621122


In [29]:
# Let's now predict movies and time!

In [126]:
train, test = train_test_data(ratings)
y_train=train[['rating','unix_timestamp']]
y_test=test[['rating','unix_timestamp']]
X_train=train.drop(['rating','unix_timestamp'], axis=1)
X_test=test.drop(['rating','unix_timestamp'], axis=1)

In [127]:
# normalizing unix_timestamp
max_train=y_train.unix_timestamp.max()
y_train['norm']=[float(i)/max_train for i in y_train.unix_timestamp]
max_test=y_test.unix_timestamp.max()
y_test['norm']=[float(i)/max_test for i in y_test.unix_timestamp]

In [128]:
min_train=y_train.unix_timestamp.min()
y_train['norm_min']=[float(i)/max_train for i in y_train.unix_timestamp]
min_test=y_test.unix_timestamp.min()
y_test['norm_min']=[float(i)/max_test for i in y_test.unix_timestamp]

In [129]:
y_train.norm.max(),y_train.norm.min(), y_test.norm.max(),y_test.norm.min()

(1.0, 0.5385275978108841, 1.0, 0.5385128227888003)

In [136]:
# parameters for the models used
hidden_units = (100,50) #same as in pytorch model
movie_embedding_dim = 50 #same as in pytorch model
user_embedding_dim = 50  #same as in pytorch model
user_max_cat_value = ratings.user_id.max()
movie_max_cat_value=X_test.movie_index.max()
es=EarlyStopping(monitor='val_MAE', min_delta=0, patience=0, verbose=0, mode='min', baseline=None, restore_best_weights=False)
# the model 
#
#
def embedding_model_mult_out(hidden_units, user_embedding_dim, user_max_cat_value, movie_embedding_dim, movie_max_cat_value,merging_method):
    # Each instance will consist of two inputs: a single user id, and a single movie id
    user_id_input = Input(shape=(1,), name='user_id')
    movie_id_input = Input(shape=(1,), name='movie_id')
    # Embeddings
    user_embedded = Embedding(user_max_cat_value+1, user_embedding_dim, 
                                       input_length=1, name='user_embedding')(user_id_input)
    movie_embedded = Embedding(movie_max_cat_value+1, movie_embedding_dim, 
                                        input_length=1, name='movie_embedding')(movie_id_input)
    # merging the embeddings
    if merging_method=='concatenate':
        merged = Concatenate()([user_embedded, movie_embedded])
    if merging_method=='dot_product':
        merged =Dot(name = 'dot_product', normalize = True, axes = 2)([user_embedded, movie_embedded])
    if merging_method=='add':
        merged =Add()([user_embedded, movie_embedded])
    if merging_method=='substract':
        merged=Subtract()([user_embedded, movie_embedded])
    if merging_method=='multiply':
        merged=Multiply()([user_embedded, movie_embedded])
    if merging_method=='average':
        merged=Average()([user_embedded, movie_embedded])
    out = Flatten()(merged)

    # Add one or more hidden layers
    for n_hidden in hidden_units:
        out = Dense(n_hidden, activation='relu')(out)

    #Two outputs: our predicted rating and unix_timestamp
    out_movie = Dense(1, activation='linear', name='movies_pred')(out)
    out_time = Dense(1, activation='relu', name='time_pred')(out)
    model = Model(inputs = [user_id_input, movie_id_input],outputs = [out_movie, out_time])
    model.compile(optimizer = 'Adam',loss='MSE',metrics=['MAE'])
    return model

In [149]:
model=embedding_model_mult_out(hidden_units, user_embedding_dim, user_max_cat_value, movie_embedding_dim, movie_max_cat_value_enc,merging_method='average')

In [150]:
history=model.fit(x=[X_train.user_id, X_train.movie_index], y=[y_train.rating,y_train.norm], batch_size=500,epochs=10, verbose=2, validation_data=[[X_test.user_id, X_test.movie_index],[y_test.rating,y_test.norm]], callbacks=[es])

Train on 75627 samples, validate on 25209 samples
Epoch 1/10
 - 4s - loss: 3.3882 - movies_pred_loss: 3.2255 - time_pred_loss: 0.1500 - movies_pred_MAE: 1.3145 - time_pred_MAE: 0.2927 - val_loss: 0.8155 - val_movies_pred_loss: 0.7845 - val_time_pred_loss: 0.0301 - val_movies_pred_MAE: 0.6857 - val_time_pred_MAE: 0.1394
Epoch 2/10
 - 3s - loss: 0.7549 - movies_pred_loss: 0.7284 - time_pred_loss: 0.0261 - movies_pred_MAE: 0.6562 - time_pred_MAE: 0.1300 - val_loss: 0.7832 - val_movies_pred_loss: 0.7630 - val_time_pred_loss: 0.0195 - val_movies_pred_MAE: 0.6738 - val_time_pred_MAE: 0.1142
Epoch 3/10
 - 3s - loss: 0.7013 - movies_pred_loss: 0.6895 - time_pred_loss: 0.0118 - movies_pred_MAE: 0.6360 - time_pred_MAE: 0.0863 - val_loss: 0.7706 - val_movies_pred_loss: 0.7639 - val_time_pred_loss: 0.0063 - val_movies_pred_MAE: 0.6709 - val_time_pred_MAE: 0.0624
Epoch 4/10
 - 3s - loss: 0.6773 - movies_pred_loss: 0.6734 - time_pred_loss: 0.0044 - movies_pred_MAE: 0.6262 - time_pred_MAE: 0.0511 - v

In [154]:
history.history['movies_pred_MAE'][9], history.history['val_movies_pred_MAE'][9], history.history['time_pred_MAE'][9], history.history['val_time_pred_MAE'][9]

(0.5955444, 0.6763771772384644, 0.032870516, 0.03596564009785652)

In [161]:
uid=1
movies_test=X_test.movie_index[X_test.user_id==uid]
predictions=pd.DataFrame(movies_test.values)
predictions.columns=['movie_index']

In [174]:
pred_movies=model.predict([[uid]*len(movies_test),movies_test.index]) # returns a list with 2 columns
predictions['rating_pred']=pred_movies[0]
predictions['time_pred']=pred_movies[1]

In [175]:
predictions.head()

Unnamed: 0,movie_index,rating_pred,time_pred
0,226,4.399261,0.620912
1,149,4.033945,0.622854
2,87,4.492454,0.592156
3,35,4.153248,0.556509
4,134,4.295627,0.617552


In [1]:
top_5=pd.DataFrame()
top_5['pred']=predictions.nlargest(5,'rating_pred').movie_index.values
top_5

NameError: name 'pd' is not defined