In [39]:
# training a model using all possible features
# questions arises if we should encode before or after splitting the data
# after some research on the internet, most recommend doing the encoding on the training data, and 
# repeating the steps on for the test data
# most exercises on kaggle did the encoding on the whole dataset, then split it
# I think there is a possibility of data leackage when encoding on the whole data set!
# usind data preperation as in the pytorch example

In [41]:
# reading the data ml-latest-small
import pandas as pd
import zipfile
import numpy as np 
zf = zipfile.ZipFile('/home/elena/Downloads/ml-latest-small.zip')
# reading ratings file:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(zf.open('ml-latest-small/ratings.csv'), names=r_cols)
m_cols=['movie_id', 'title', 'genre']
movies = pd.read_csv(zf.open('ml-latest-small/movies.csv'), names=m_cols)
# merging ratings and movies
data=pd.merge(ratings,movies,on='movie_id')

In [42]:
zz = zipfile.ZipFile('/home/elena/Downloads/ml-100k.zip')
# reading users file:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(zz.open('ml-100k/u.user'), sep='|', names=u_cols,encoding='latin-1')
data=pd.merge(users,data, on='user_id')
data.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code,movie_id,rating,unix_timestamp,title,genre
0,1,24,M,technician,85711,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,24,M,technician,85711,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,24,M,technician,85711,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,24,M,technician,85711,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,24,M,technician,85711,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [43]:
cat_features=['sex','occupation','zip_code','genre' ]
num_features=['user_id','age','movie_id','rating','unix_timestamp','movie_index']
data.dtypes
# we can remove title when training the model, since it will not effect the outcome; it's just a name 
# assigned to each movie_id

user_id             int64
age                 int64
sex                object
occupation         object
zip_code           object
movie_id            int64
rating            float64
unix_timestamp      int64
title              object
genre              object
dtype: object

In [44]:
unique_movies=data.movie_id.unique()
unique_movies.min(),unique_movies.max(), len(unique_movies)
# movie_id's are numbers, but they have a bigger range then the actual amount of unique movies!
# lets start with indexing the movie_id

(1, 193609, 9724)

In [45]:
unique_movies = data.movie_id.unique() # returns a np array
movie_to_index = {old: new for new, old in enumerate(unique_movies)} # indexing movie_id, tart at 0
index_to_movie = {idx: movie for movie, idx in movie_to_index.items()}
new_movies = data.movie_id.map(movie_to_index) # replaces movie_id with coresp. index
data['movie_index']=new_movies
data.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code,movie_id,rating,unix_timestamp,title,genre,movie_index
0,1,24,M,technician,85711,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0
1,1,24,M,technician,85711,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance,1
2,1,24,M,technician,85711,6,4.0,964982224,Heat (1995),Action|Crime|Thriller,2
3,1,24,M,technician,85711,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,3
4,1,24,M,technician,85711,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,4


In [46]:
# to be sure that we have done it right, lets compare
compare_data=data[['user_id', 'movie_id', 'movie_index','rating']]
compare_data['old_movie_id']=compare_data.movie_index.map(index_to_movie)
compare_data.tail() # looks good

Unnamed: 0,user_id,movie_id,movie_index,rating,old_movie_id
100831,610,160341,9719,2.5,160341
100832,610,160527,9720,4.5,160527
100833,610,160836,9721,3.0,160836
100834,610,163937,9722,3.5,163937
100835,610,163981,9723,3.5,163981


In [49]:
data.user_id.unique().shape[0], data.user_id.min(), data.user_id.max(), 
# we have as many users as we have id's for them

(610, 1, 610)

In [50]:
data.occupation.unique().shape[0], data.zip_code.unique().shape[0], data.genre.unique().shape[0]

(21, 541, 951)

In [51]:
# we have several categorical features; 'sex','occupation','zip_code','genre'
# age is the only numerical feature where the number has a 'meaning'
# there a 2 ways how to handle cat. features: encoding (different ways to do so) and embeddings
#  first approach will be to use embeddings
# note: 1 embedding layer is required for each categorical feature, and the embedding expects the 
# categories to be ordinal encoded, although no relationship between the categories is assumed 
# (statement from kaggle)

In [52]:
# splitting into train and test data
# data was split and safed! 
train=pd.read_pickle('/home/elena/Downloads/traindata.pkl')
test=pd.read_pickle('/home/elena/Downloads/testdata.pkl')

In [58]:
# we need to add movie_index to train and test data
train['movie_index']=train.movie_id.map(movie_to_index)
test['movie_index']=test.movie_id.map(movie_to_index)

In [59]:
train.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code,movie_id,rating,unix_timestamp,title,genre,movie_index
97717,606,28,M,programmer,63044,3462,4.0,1171501099,Modern Times (1936),Comedy|Drama|Romance,1185
100124,610,22,M,student,21227,8914,4.0,1493845360,Primer (2004),Drama|Sci-Fi,2266
25952,180,22,F,administrator,60202,1196,4.0,1270237862,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi,68
25871,178,26,M,other,49512,2231,4.5,1163673637,Rounders (1998),Drama,2183
97255,605,33,M,engineer,33716,1588,4.0,1277094877,George of the Jungle (1997),Children|Comedy,1495


In [60]:
movie_to_index[3462]

1185

In [62]:
y_train=train.rating
y_test=test.rating
X_train=train.drop('rating',axis=1)
X_test=test.drop('rating', axis=1)

In [64]:
X_train.shape

(75627, 10)

In [65]:
# Encoding and getting data ready
from sklearn.preprocessing import LabelEncoder
# 1. approach: we fit on the training data, and transform the encoding to train and test datat 
# we got an error, because e.g. we have genre which are in test data but not in train!
# thus, we use encoding seperately
# Make copy to avoid changing original data 
X_train_label = X_train.copy()
X_test_label = X_test.copy()

cat_features=['unix_timestamp','sex','occupation','zip_code','genre']
# Apply label encoder to each column with categorical data
label_encoder = LabelEncoder()
for col in cat_features:
    X_train_label[col+'_label'] = label_encoder.fit_transform(X_train[col])
    X_test_label[col+'_label'] = label_encoder.fit_transform(X_test[col])

In [66]:
X_train_label.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code,movie_id,unix_timestamp,title,genre,movie_index,unix_timestamp_label,sex_label,occupation_label,zip_code_label,genre_label
97717,606,28,M,programmer,63044,3462,1171501099,Modern Times (1936),Comedy|Drama|Romance,1185,25146,1,14,342,661
100124,610,22,M,student,21227,8914,1493845360,Primer (2004),Drama|Sci-Fi,2266,56337,1,18,138,842
25952,180,22,F,administrator,60202,1196,1270237862,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi,68,36514,0,0,325,120
25871,178,26,M,other,49512,2231,1163673637,Rounders (1998),Drama,2183,23751,1,13,257,779
97255,605,33,M,engineer,33716,1588,1277094877,George of the Jungle (1997),Children|Comedy,1495,36920,1,4,198,574


In [67]:
col_embedding=['user_id', 'movie_index', 'unix_timestamp_label', 'sex_label', 'occupation_label', 'zip_code_label', 'genre_label']
# I left the feature age out, not sure how to handle it
X_train_emb=X_train_label[col_embedding]
X_test_emb=X_test_label[col_embedding]

In [68]:
# Implementation 
from keras.layers import Input, Embedding, Concatenate, Flatten, Dense, Dot, Add, Multiply, Subtract, Average, Reshape
from keras.models import Model
from keras.callbacks import EarlyStopping

In [69]:
# all features as embedding inputs

In [70]:
X_train_label.sex_label.max(), X_test_label.sex_label.max(),
X_train_label.occupation_label.max(), X_test_label.occupation_label.max(),
X_train_label.zip_code_label.max(), X_test_label.zip_code_label.max(),
X_train_label.genre_label.max(), X_test_label.genre_label.max() # difference !

(915, 761)

In [71]:
def embedding_model(hidden_units, embedding_dim, merging_method):
    # Each instance will consist of 7 inputs:
    # col_embedding=['user_id', 'movie_index', 'unix_timestamp', 'sex_label', 'occupation_label', 'zip_code_label', 'genre_label']

    user_id_input = Input(shape=(1,), name='user_id')
    movie_index_input = Input(shape=(1,), name='movie_index')
    unix_timestamp_input = Input(shape=(1,), name='unix_timestamp')
    sex_input = Input(shape=(1,), name='sex')
    occupation_input=Input(shape=(1,), name='occupation')
    zip_code_input=Input(shape=(1,), name='zip_code')
    genre_input=Input(shape=(1,), name='genre')

    # Embeddings
    user_embedded = Embedding(data.movie_id.max()+1, embedding_dim, 
                                       input_length=1, name='user_embedding')(user_id_input)
    movie_embedded = Embedding(data.movie_index.max()+1, embedding_dim, 
                                        input_length=1, name='movie_embedding')(movie_index_input)
    unix_timestamp_embedded = Embedding(X_train_label.unix_timestamp_label.max()+1, embedding_dim, 
                                        input_length=1, name='unix_timestamp_embedding')(unix_timestamp_input)
    sex_embedded = Embedding(X_train_label.sex_label.max()+1, embedding_dim, 
                                        input_length=1, name='sex_embedding')(sex_input)
    occupation_embedded= Embedding(X_train_label.occupation_label.max()+1, embedding_dim, 
                                        input_length=1, name='occupation_embedding')(occupation_input)
    zipe_code_embedded = Embedding(X_train_label.zip_code_label.max()+1, embedding_dim, 
                                        input_length=1, name='zip_code_embedding')(zip_code_input)
    genre_embedded = Embedding(X_train_label.genre_label.max()+1, embedding_dim, 
                                        input_length=1, name='genre_embedding')(genre_input)
    # merging the embeddings
    embeddings_cols=[user_embedded, movie_embedded,  unix_timestamp_embedded, zipe_code_embedded, occupation_embedded, genre_embedded]

    if merging_method=='concatenate':
        merged = Concatenate()(embeddings_cols)
    if merging_method=='dot_product':
        merged =Dot(name = 'dot_product', normalize = True, axes = 2)(embeddings_cols)
    if merging_method=='add':
        merged =Add()(embeddings_cols)
    if merging_method=='substract':
        merged=Subtract()(embeddings_cols)
    if merging_method=='multiply':
        merged=Multiply()(embeddings_cols)
    if merging_method=='average':
        merged=Average()(embeddings_cols)
    out = Flatten()(merged)

    # Add one or more hidden layers
    for n_hidden in hidden_units:
        out = Dense(n_hidden, activation='relu')(out)

    # A single output: our predicted rating
    out = Dense(1, activation='linear', name='prediction')(out)
    model=Model(inputs = [user_id_input, movie_index_input, unix_timestamp_input, sex_input, occupation_input, zip_code_input, genre_input],outputs = out)
    model.compile(optimizer = 'Adam',loss='MSE',metrics=['MAE'])
    return model

In [72]:
hidden_units = (100,50) #same as in pytorch model
embedding_dim = 50 #same as in pytorch model
es=EarlyStopping(monitor='val_MAE', min_delta=0, patience=0, verbose=0, mode='min', baseline=None, restore_best_weights=False)
# col_embedding=['user_id', 'movie_index', 'unix_timestamp', 'sex_label', 'occupation_label', #'zip_code_label', 'genre_label']

inputs=[X_train_label.user_id,X_train_label.movie_index,X_train_label.unix_timestamp_label,X_train_label.sex_label, X_train_label.occupation_label, X_train_label.zip_code_label, X_train_label.genre_label]
inputs_test=[X_test_label.user_id,X_test_label.movie_index,X_test_label.unix_timestamp_label,X_test_label.sex_label, X_test_label.occupation_label, X_test_label.zip_code_label, X_test_label.genre_label]

In [73]:
model_concatenate=embedding_model(hidden_units,embedding_dim, merging_method='concatenate')
#model_concatenate.summary(line_length=88)

Model: "model_3"
________________________________________________________________________________________
Layer (type)                 Output Shape       Param #   Connected to                  
user_id (InputLayer)         (None, 1)          0                                       
________________________________________________________________________________________
movie_index (InputLayer)     (None, 1)          0                                       
________________________________________________________________________________________
unix_timestamp (InputLayer)  (None, 1)          0                                       
________________________________________________________________________________________
zip_code (InputLayer)        (None, 1)          0                                       
________________________________________________________________________________________
occupation (InputLayer)      (None, 1)          0                                       
____

In [74]:
trained_model_concatenate= model_concatenate.fit(x=inputs, y=y_train, batch_size=500,epochs=10, verbose=2, validation_data=[inputs_test,y_test], callbacks=[es])

Train on 75627 samples, validate on 25209 samples
Epoch 1/10
 - 59s - loss: 2.7993 - MAE: 1.2041 - val_loss: 0.9161 - val_MAE: 0.7632
Epoch 2/10
 - 58s - loss: 0.5755 - MAE: 0.5678 - val_loss: 1.2143 - val_MAE: 0.8771


In [77]:
model_add=embedding_model(hidden_units,embedding_dim, merging_method='add')
trained_model_add= model_add.fit(x=inputs, y=y_train, batch_size=500,epochs=10, verbose=2, validation_data=[inputs_test,y_test], callbacks=[es])

Train on 75627 samples, validate on 25209 samples
Epoch 1/10
 - 57s - loss: 2.5122 - MAE: 1.1424 - val_loss: 0.9363 - val_MAE: 0.7647
Epoch 2/10
 - 57s - loss: 0.5650 - MAE: 0.5582 - val_loss: 1.0178 - val_MAE: 0.7992


In [80]:
model_multiply=embedding_model(hidden_units,embedding_dim, merging_method='multiply')
trained_model_multiply= model_multiply.fit(x=inputs, y=y_train, batch_size=500,epochs=10, verbose=2, validation_data=[inputs_test,y_test], callbacks=[es])

Train on 75627 samples, validate on 25209 samples
Epoch 1/10
 - 59s - loss: 5.3430 - MAE: 1.8775 - val_loss: 1.0830 - val_MAE: 0.8286
Epoch 2/10
 - 60s - loss: 1.0885 - MAE: 0.8292 - val_loss: 1.0823 - val_MAE: 0.8254
Epoch 3/10
 - 59s - loss: 1.0887 - MAE: 0.8293 - val_loss: 1.0825 - val_MAE: 0.8262
