In [106]:
# training a model using all possible features
# questions arises if we should encode before or after splitting the data
# after some research on the internet, most recommend doing the encoding on the training data, and 
# repeating the steps on for the test data
# most exercises on kaggle did the encoding on the whole dataset, then split it
# I think there is a possibility of data leackage when encoding on the whole data set!
# BUT I get an error fitting the encoding on the training set, then  transforming the test set, since test set my include features not in train set! So I fitted and transformed the encoding on the whole data set
# usind data preperation as in the pytorch example

In [107]:
import pandas as pd
import zipfile
import numpy as np 

In [108]:
#  loading data
csv_ratings='ml-latest-small/ratings.csv'
csv_movies='ml-latest-small/movies.csv'
def get_data_ratings(csv_ratings,csv_movies):
    zf = zipfile.ZipFile('/home/elena/Downloads/ml-latest-small.zip')
    # reading ratings file:
    r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
    ratings = pd.read_csv(zf.open(csv_ratings), names=r_cols)
    m_cols=['movie_id', 'title', 'genre']
    movies = pd.read_csv(zf.open(csv_movies), names=m_cols)
    # merging ratings and movies
    data=pd.merge(ratings,movies,on='movie_id')
    zz = zipfile.ZipFile('/home/elena/Downloads/ml-100k.zip')
    # reading users file:
    u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
    users = pd.read_csv(zz.open('ml-100k/u.user'), sep='|', names=u_cols,encoding='latin-1')
    return pd.merge(users,data, on='user_id')

In [109]:
data=get_data_ratings(csv_ratings,csv_movies)

In [110]:
unique_movies=data.movie_id.unique()
unique_movies.min(),unique_movies.max(), len(unique_movies)
# movie_id's are numbers, but they have a bigger range then the actual amount of unique movies!
# lets start with indexing the movie_id

(1, 193609, 9724)

In [112]:
# loading train / test data
def train_test_data(ratings):
    unique_movies = ratings.movie_id.unique() # returns a np array
    movie_to_index = {old: new for new, old in enumerate(unique_movies)} # indexing movie_id, tart at 0
    index_to_movie = {idx: movie for movie, idx in movie_to_index.items()}
    new_movies = ratings.movie_id.map(movie_to_index) # replaces movie_id with coresp. index
    ratings['movie_index']=new_movies

    train=pd.read_pickle('/home/elena/Downloads/traindata.pkl')
    test=pd.read_pickle('/home/elena/Downloads/testdata.pkl')
    train['movie_index']=train.movie_id.map(movie_to_index)
    test['movie_index']=test.movie_id.map(movie_to_index)
    return (train,test)

In [113]:
train, test = train_test_data(data)

In [114]:
data.user_id.unique().shape[0], data.user_id.min(), data.user_id.max(), 
# we have as many users as we have id's for them

(610, 1, 610)

In [115]:
data.occupation.unique().shape[0], data.zip_code.unique().shape[0], data.genre.unique().shape[0]

(21, 541, 951)

In [116]:
# we have several categorical features; 'sex','occupation','zip_code','genre'
# age is the only numerical feature where the number has a 'meaning'
# there a 2 ways how to handle cat. features: encoding (different ways to do so) and embeddings
#  first approach will be to use embeddings
# note: 1 embedding layer is required for each categorical feature, and the embedding expects the 
# categories to be ordinal encoded, although no relationship between the categories is assumed 
# (statement from kaggle)

In [117]:
# to better compare the above models with the ones where only user and movies are used as inputs, we 
# should fit the encoding to the whole data and then transform it on train and test, since we did that # with the indexing of the movie_id

In [118]:
y_train=train.rating
y_test=test.rating
X_train=train.drop('rating',axis=1)
X_test=test.drop('rating', axis=1)

In [131]:
# Encoding and getting data ready
from sklearn.preprocessing import LabelEncoder
# 1. approach: we fit on the training data, and transform the encoding to train and test datat 
# we got an error, because e.g. we have genre which are in test data but not in train!
# thus, we use encoding seperately
# Make copy to avoid changing original data 
X_train_label = X_train.copy()
X_test_label = X_test.copy()

cat_features=['unix_timestamp','sex','occupation','zip_code','genre']
# Apply label encoder to each column with categorical data
label_encoder = LabelEncoder()
for col in cat_features:
    label_encoder.fit(data[col])
    X_train_label[col+'_label'] = label_encoder.transform(X_train[col])
    X_test_label[col+'_label'] = label_encoder.transform(X_test[col])

In [132]:
# Implementation 
from keras.layers import Input, Embedding, Concatenate, Flatten, Dense, Dot, Add, Multiply, Subtract, Average, Reshape
from keras.models import Model
from keras.callbacks import EarlyStopping

In [123]:
# all features as embedding inputs

In [139]:
X_train_label.unix_timestamp_label.max(),X_test_label.unix_timestamp_label.max()

(85041, 85042)

In [140]:
def embedding_model(hidden_units, embedding_dim, merging_method):
    # Each instance will consist of 7 inputs:
    # col_embedding=['user_id', 'movie_index', 'unix_timestamp', 'sex_label', 'occupation_label', 'zip_code_label', 'genre_label']

    user_id_input = Input(shape=(1,), name='user_id')
    movie_index_input = Input(shape=(1,), name='movie_index')
    unix_timestamp_input = Input(shape=(1,), name='unix_timestamp')
    sex_input = Input(shape=(1,), name='sex')
    occupation_input=Input(shape=(1,), name='occupation')
    zip_code_input=Input(shape=(1,), name='zip_code')
    genre_input=Input(shape=(1,), name='genre')

    # Embeddings
    user_embedded = Embedding(data.movie_id.max()+1, embedding_dim, 
                                       input_length=1, name='user_embedding')(user_id_input)
    movie_embedded = Embedding(data.movie_index.max()+1, embedding_dim, 
                                        input_length=1, name='movie_embedding')(movie_index_input)
    unix_timestamp_embedded = Embedding(X_test_label.unix_timestamp_label.max()+1, embedding_dim, 
                                        input_length=1, name='unix_timestamp_embedding')(unix_timestamp_input)
    sex_embedded = Embedding(X_train_label.sex_label.max()+1, embedding_dim, 
                                        input_length=1, name='sex_embedding')(sex_input)
    occupation_embedded= Embedding(X_train_label.occupation_label.max()+1, embedding_dim, 
                                        input_length=1, name='occupation_embedding')(occupation_input)
    zipe_code_embedded = Embedding(X_train_label.zip_code_label.max()+1, embedding_dim, 
                                        input_length=1, name='zip_code_embedding')(zip_code_input)
    genre_embedded = Embedding(X_train_label.genre_label.max()+1, embedding_dim, 
                                        input_length=1, name='genre_embedding')(genre_input)
    # merging the embeddings
    embeddings_cols=[user_embedded, movie_embedded,  unix_timestamp_embedded, zipe_code_embedded, occupation_embedded, genre_embedded]

    if merging_method=='concatenate':
        merged = Concatenate()(embeddings_cols)
    if merging_method=='dot_product':
        merged =Dot(name = 'dot_product', normalize = True, axes = 2)(embeddings_cols)
    if merging_method=='add':
        merged =Add()(embeddings_cols)
    if merging_method=='substract':
        merged=Subtract()(embeddings_cols)
    if merging_method=='multiply':
        merged=Multiply()(embeddings_cols)
    if merging_method=='average':
        merged=Average()(embeddings_cols)
    out = Flatten()(merged)

    # Add one or more hidden layers
    for n_hidden in hidden_units:
        out = Dense(n_hidden, activation='relu')(out)

    # A single output: our predicted rating
    out = Dense(1, activation='linear', name='prediction')(out)
    model=Model(inputs = [user_id_input, movie_index_input, unix_timestamp_input, sex_input, occupation_input, zip_code_input, genre_input],outputs = out)
    model.compile(optimizer = 'Adam',loss='MSE',metrics=['MAE'])
    return model

In [141]:
hidden_units = (100,50) #same as in pytorch model
embedding_dim = 50 #same as in pytorch model
es=EarlyStopping(monitor='val_MAE', min_delta=0, patience=0, verbose=0, mode='min', baseline=None, restore_best_weights=False)
# col_embedding=['user_id', 'movie_index', 'unix_timestamp', 'sex_label', 'occupation_label', #'zip_code_label', 'genre_label']

inputs=[X_train_label.user_id,X_train_label.movie_index,X_train_label.unix_timestamp_label,X_train_label.sex_label, X_train_label.occupation_label, X_train_label.zip_code_label, X_train_label.genre_label]
inputs_test=[X_test_label.user_id,X_test_label.movie_index,X_test_label.unix_timestamp_label,X_test_label.sex_label, X_test_label.occupation_label, X_test_label.zip_code_label, X_test_label.genre_label]

In [142]:
mergemethod=['concatenate', 'add', 'multiply', 'average']
# merging method dot product and substract are only possible for 2 inputs / embedding
summary=pd.DataFrame(columns=['merge','val_MAE', 'MAE','epoch','val_loss', 'loss'])
merge,epoch,val_MAE,MAE,loss,val_loss=[],[],[],[],[],[]
# looping through the merging methods
for m in mergemethod:
    model=embedding_model(hidden_units, embedding_dim, merging_method=m)
    history=model.fit(x=inputs, y=y_train, batch_size=500,epochs=10, verbose=0, validation_data=[inputs_test,y_test], callbacks=[es])
    # collecting MAE's and loss
    merge.append(m)
    n=len(history.epoch)
    epoch.append(n)
    val_MAE.append(history.history['val_MAE'][n-1])
    MAE.append(history.history['MAE'][n-1])
    loss.append(history.history['loss'][n-1])
    val_loss.append(history.history['val_loss'][n-1])
summary['merge']=merge 
summary['val_MAE']=val_MAE 
summary['epoch']=epoch
summary['MAE']=MAE 
summary['loss']=loss 
summary['val_loss']=val_loss

In [143]:
summary

Unnamed: 0,merge,val_MAE,MAE,epoch,val_loss,loss
0,concatenate,0.697425,0.568162,2,0.818559,0.578214
1,add,0.687128,0.557094,2,0.795325,0.561479
2,multiply,0.824976,0.830012,3,1.082283,1.08874
3,average,0.687514,0.474438,3,0.796303,0.441894


In [144]:
X_train_label.columns

Index(['user_id', 'age', 'sex', 'occupation', 'zip_code', 'movie_id',
       'unix_timestamp', 'title', 'genre', 'movie_index',
       'unix_timestamp_label', 'sex_label', 'occupation_label',
       'zip_code_label', 'genre_label'],
      dtype='object')

In [145]:
# feature selection
from sklearn.feature_selection import SelectKBest, f_classif

feature_cols = ['user_id', 'movie_index', 'unix_timestamp_label', 'sex_label', 'occupation_label','zip_code_label', 'genre_label']
selector = SelectKBest(f_classif, k=3)
X_new = selector.fit_transform(X_train_label[feature_cols], y_train)
X_new

array([[ 1185, 31422,    14],
       [ 2266, 72974,    18],
       [   68, 46607,     0],
       ...,
       [ 2290, 33548,    15],
       [  708,  2135,     6],
       [ 1312, 59070,    18]])

In [146]:
# Get back the features we've kept, zero out all other features
selected_features = pd.DataFrame(selector.inverse_transform(X_new), 
                                 index=train.index, 
                                 columns=feature_cols)
selected_features.head()

Unnamed: 0,user_id,movie_index,unix_timestamp_label,sex_label,occupation_label,zip_code_label,genre_label
97717,0,1185,31422,0,14,0,0
100124,0,2266,72974,0,18,0,0
25952,0,68,46607,0,0,0,0
25871,0,2183,29553,0,13,0,0
97255,0,1495,47137,0,4,0,0


In [147]:
selector = SelectKBest(f_classif, k=4)
X_new = selector.fit_transform(X_train_label[feature_cols], y_train)
# Get back the features we've kept, zero out all other features
selected_features = pd.DataFrame(selector.inverse_transform(X_new), 
                                 index=train.index, 
                                 columns=feature_cols)
selected_features.head()

Unnamed: 0,user_id,movie_index,unix_timestamp_label,sex_label,occupation_label,zip_code_label,genre_label
97717,606,1185,31422,0,14,0,0
100124,610,2266,72974,0,18,0,0
25952,180,68,46607,0,0,0,0
25871,178,2183,29553,0,13,0,0
97255,605,1495,47137,0,4,0,0


In [148]:
selector = SelectKBest(f_classif, k=2)
X_new = selector.fit_transform(X_train_label[feature_cols], y_train)
# Get back the features we've kept, zero out all other features
selected_features = pd.DataFrame(selector.inverse_transform(X_new), 
                                 index=train.index, 
                                 columns=feature_cols)
selected_features.head()

Unnamed: 0,user_id,movie_index,unix_timestamp_label,sex_label,occupation_label,zip_code_label,genre_label
97717,0,1185,31422,0,0,0,0
100124,0,2266,72974,0,0,0,0
25952,0,68,46607,0,0,0,0
25871,0,2183,29553,0,0,0,0
97255,0,1495,47137,0,0,0,0


In [149]:
# most important: movie_index, unix_timestamp_label, occupation_label, user_id
# let's create an embedding model with movie_index, unix_timestamp_label, occupation_label

In [153]:
def embedding_model(hidden_units, embedding_dim, merging_method):
    # Each instance will consist of 7 inputs:
    # col_embedding=['user_id', 'movie_index', 'unix_timestamp', 'sex_label', 'occupation_label', 'zip_code_label', 'genre_label']

    movie_index_input = Input(shape=(1,), name='movie_index')
    unix_timestamp_input = Input(shape=(1,), name='unix_timestamp')
    occupation_input=Input(shape=(1,), name='occupation')

    # Embeddings
    movie_embedded = Embedding(data.movie_index.max()+1, embedding_dim, 
                                        input_length=1, name='movie_embedding')(movie_index_input)
    unix_timestamp_embedded = Embedding(X_test_label.unix_timestamp_label.max()+1, embedding_dim, 
                                        input_length=1, name='unix_timestamp_embedding')(unix_timestamp_input)
    occupation_embedded= Embedding(X_train_label.occupation_label.max()+1, embedding_dim, 
                                        input_length=1, name='occupation_embedding')(occupation_input)
    # merging the embeddings
    embeddings_cols=[movie_embedded,  unix_timestamp_embedded, occupation_embedded]

    if merging_method=='concatenate':
        merged = Concatenate()(embeddings_cols)
    if merging_method=='dot_product':
        merged =Dot(name = 'dot_product', normalize = True, axes = 2)(embeddings_cols)
    if merging_method=='add':
        merged =Add()(embeddings_cols)
    if merging_method=='substract':
        merged=Subtract()(embeddings_cols)
    if merging_method=='multiply':
        merged=Multiply()(embeddings_cols)
    if merging_method=='average':
        merged=Average()(embeddings_cols)
    out = Flatten()(merged)

    # Add one or more hidden layers
    for n_hidden in hidden_units:
        out = Dense(n_hidden, activation='relu')(out)

    # A single output: our predicted rating
    out = Dense(1, activation='linear', name='prediction')(out)
    model=Model(inputs = [movie_index_input, unix_timestamp_input, occupation_input],outputs = out)
    model.compile(optimizer = 'Adam',loss='MSE',metrics=['MAE'])
    return model

In [154]:
inputs=[X_train_label.movie_index,X_train_label.unix_timestamp_label, X_train_label.occupation_label]
inputs_test=[X_test_label.movie_index,X_test_label.unix_timestamp_label, X_test_label.occupation_label]

In [165]:
mergemethod=['concatenate', 'add', 'multiply', 'average']
# merging method dot product and substract are only possible for 2 inputs / embedding
summary=pd.DataFrame(columns=['merge','val_MAE', 'MAE','epoch','val_loss', 'loss'])
merge,epoch,val_MAE,MAE,loss,val_loss=[],[],[],[],[],[]
# for prediction
# looping through the merging methods
for m in mergemethod:
    model=embedding_model(hidden_units, embedding_dim, merging_method=m)
    history=model.fit(x=inputs, y=y_train, batch_size=500,epochs=10, verbose=0, validation_data=[inputs_test,y_test], callbacks=[es])
    # collecting MAE's and loss
    merge.append(m)
    n=len(history.epoch)
    epoch.append(n)
    val_MAE.append(history.history['val_MAE'][n-1])
    MAE.append(history.history['MAE'][n-1])
    loss.append(history.history['loss'][n-1])
    val_loss.append(history.history['val_loss'][n-1])
summary['merge']=merge 
summary['val_MAE']=val_MAE 
summary['epoch']=epoch
summary['MAE']=MAE 
summary['loss']=loss 
summary['val_loss']=val_loss

ValueError: Error when checking model input: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 3 array(s), but instead got the following list of 2 arrays: [array([[1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       ...

In [156]:
summary # worse then using all data

Unnamed: 0,merge,val_MAE,MAE,epoch,val_loss,loss
0,concatenate,0.764004,0.605479,2,0.949332,0.64158
1,add,0.762126,0.588238,2,0.951217,0.612642
2,multiply,0.855141,0.642427,3,1.137048,0.748072
3,average,0.748042,0.610416,2,0.91349,0.660176
